require(RCurl)
# expression data
exdata <- read.delim("~/Dropbox/eQTL/Data/AIBL_expression_set/AIBL_Gene_Expression.txt", sep = "", header = T)
ids2 <- read.delim("~/Dropbox/eQTL/Data/AIBL_expression_set/AIBL_Gene_Expression_IDs_UpdtdDec2017.txt", sep = " ")
colnames(exdata) <- as.vector(ids2$x) # replace colnames with ids2
# metadata
meta2 <- read.delim(text = getURL("https://raw.githubusercontent.com/IanBrettell/AD_project/master/Working/key_metadata.txt"), header = T)
hist(meta2$Age)
library(tidyverse)
exdata1 <- as_tibble(exdata)
exdata1 <- as_tibble(t(exdata1)) # transpose to put samples in rows, genes in columns
colnames(exdata1) <- rownames(exdata) # add genes names
exdata1$AIBL.Id <- as.integer(colnames(exdata)) # add 'AIBL.Id' column
exdata2 <- left_join(meta2, exdata1, by = "AIBL.Id") %>%
na.omit # add metadata and remove PET and apoe4 columns with 'NA'
rownames(exdata2) <- exdata2$AIBL.Id # to ensure the labels in the plots reflect the AIBL.Ids
dim(exdata2)
## [1] 196 22016
exdata2[, 1:10]
## AIBL.Id Demographic.Sex PET apoe4 Age 2315554 2315633 2315674
## 4 4 Female POS 0 84.63518 6.480369 6.891905 6.565621
## 5 5 Male NEG 0 73.14168 6.314319 6.779152 6.349589
## 12 12 Male POS 1 78.88843 6.290770 6.567759 6.436518
## 14 14 Female POS 1 65.47023 5.948114 6.681805 6.175586
## 21 21 Female NEG 0 73.46475 6.037015 6.485205 6.281951
## 23 23 Female NEG 0 83.21971 6.186476 6.513650 6.219359
## 25 25 Female POS 0 79.72348 6.076614 6.657682 6.498325
## 26 26 Male POS 0 81.75770 6.328989 6.554273 6.471516
## 28 28 Male POS 1 79.87406 6.297244 6.795406 6.312110
## 29 29 Female NEG 0 80.87337 5.866402 6.558860 6.351652
## 31 31 Female NEG 0 83.25804 5.838968 6.623398 6.056068
## 33 33 Male POS 1 82.00137 6.248705 6.666009 6.349729
## 36 36 Male POS 0 88.10951 6.493032 7.002052 6.738490
## 39 39 Male POS 0 62.62834 6.167725 6.650965 6.341245
## 40 40 Female POS 1 82.72964 6.489249 6.505397 6.549479
## 42 42 Male POS 0 81.60986 6.471965 7.012865 6.925032
## 43 43 Female NEG 0 81.20739 6.268364 6.526505 6.445091
## 47 47 Female NEG 1 78.89938 6.464944 6.714745 6.362125
## 50 50 Female NEG 0 65.41273 6.406922 6.548416 6.622533
## 52 52 Female NEG 0 72.43806 6.323187 6.939356 6.567885
## 53 53 Male POS 1 79.26899 5.872199 6.444315 6.278965
## 55 55 Male POS 0 78.03696 6.576354 6.636825 6.511823
## 56 56 Male NEG 0 79.14853 6.501945 6.965795 6.661681
## 57 57 Male POS 1 77.78508 6.116833 6.320499 6.306403
## 59 59 Female POS 0 77.88638 6.382484 6.473893 6.468157
## 60 60 Female POS 0 88.07940 5.758869 6.401347 5.973544
## 61 61 Female NEG 0 79.66051 6.469268 6.596029 6.642933
## 62 62 Female POS 0 75.66872 5.887743 6.969227 6.277725
## 64 64 Female NEG 0 78.34634 6.586534 6.619251 6.520611
## 66 66 Female POS 1 69.35524 6.469714 6.977623 6.569762
## 67 67 Female NEG 1 76.04654 6.293906 6.624161 6.545438
## 68 68 Female NEG 0 74.45585 6.255258 6.709539 6.444970
## 72 72 Male POS 0 76.38330 5.723363 6.626473 6.133998
## 75 75 Male NEG 0 75.57290 5.992771 6.479666 6.180949
## 79 79 Male POS 1 73.92471 6.304891 6.642175 6.449253
## 80 80 Female NEG 0 72.26010 6.538413 6.757661 6.394387
## 86 86 Male NEG 0 73.59343 6.202429 6.891970 6.537608
## 88 88 Female NEG 0 69.75222 6.298653 6.688547 6.350240
## 89 89 Male NEG 0 68.81314 6.156755 6.845456 6.379305
## 90 90 Male NEG 0 79.77002 6.368520 6.562456 6.455850
## 91 91 Female POS 0 61.09514 6.244896 6.715811 6.325632
## 97 97 Male NEG 0 63.32649 6.406573 6.689737 6.611027
## 99 99 Male NEG 0 84.08761 6.204438 6.601850 6.847750
## 100 100 Male POS 1 77.67283 6.249671 6.784119 6.479181
## 101 101 Male POS 1 78.60917 6.217539 6.723117 6.340995
## 102 102 Female POS 1 67.27173 6.153372 6.777107 6.202899
## 103 103 Male POS 1 73.98220 6.252399 6.572031 6.304950
## 104 104 Female NEG 0 80.33128 6.249090 6.692391 6.464357
## 110 110 Male NEG 0 85.93292 6.124358 6.425423 6.278595
## 111 111 Male POS 1 83.79466 6.790819 7.086384 7.004872
## 112 112 Male POS 1 82.55168 6.104592 6.724418 6.381245
## 113 113 Female NEG 0 77.97125 6.233691 6.623124 6.583993
## 117 117 Male POS 0 80.30938 6.283379 6.765484 6.495661
## 126 126 Female NEG 0 67.69610 6.088120 6.619230 6.457667
## 138 138 Female NEG 0 60.96920 6.462123 6.682734 6.714758
## 152 152 Male NEG 0 69.33607 6.216405 6.748362 6.286094
## 259 259 Female POS 1 79.85216 6.174100 6.739155 6.429327
## 270 270 Female NEG 1 65.14168 6.465453 6.511014 6.613107
## 273 273 Male NEG 1 64.89802 6.886670 7.265324 6.773535
## 276 276 Male NEG 0 65.25120 6.202502 6.508606 6.590262
## 277 277 Male POS 1 75.00342 6.074626 6.704253 6.368754
## 278 278 Female POS 1 67.83847 6.285486 6.725421 6.338260
## 280 280 Female NEG 0 61.01027 6.142124 6.819817 6.228511
## 287 287 Male NEG 1 64.61054 6.148718 6.611755 6.329001
## 289 289 Female NEG 0 60.61328 6.303928 6.605411 6.467153
## 294 294 Male NEG 1 62.28884 6.310482 6.757581 6.524905
## 301 301 Female NEG 0 66.70773 6.250539 6.829466 6.457082
## 310 310 Male POS 1 64.13415 6.573792 7.037530 6.636816
## 315 315 Female NEG 0 67.80835 6.024107 6.884582 6.560838
## 317 317 Female NEG 0 65.22656 6.346622 6.880260 6.345604
## 331 331 Male POS 0 72.56400 6.091366 6.539202 6.147746
## 342 342 Male NEG 0 67.57837 6.459433 6.796057 6.526122
## 349 349 Female NEG 0 72.99932 6.539442 6.664520 6.724221
## 354 354 Female POS 1 83.58658 6.379879 6.828928 6.583614
## 362 362 Female NEG 0 62.76249 6.489151 6.707376 6.504605
## 365 365 Female POS 1 80.52019 6.225867 6.579102 6.291010
## 366 366 Male POS 0 77.81520 6.610969 6.654806 6.727655
## 367 367 Female NEG 1 68.91718 6.159727 6.681467 6.240086
## 371 371 Female NEG 1 66.98426 6.265874 6.729486 6.440461
## 372 372 Female POS 0 65.78508 6.302200 6.607452 6.383913
## 373 373 Female POS 1 72.54757 6.135674 6.731264 6.309723
## 374 374 Female POS 0 64.54757 6.073725 6.798658 6.481094
## 376 376 Female POS 1 76.49008 6.616582 7.186329 6.609141
## 377 377 Male POS 0 73.93018 6.023341 6.755030 6.431724
## 378 378 Female POS 0 84.12320 6.260883 6.564279 6.617119
## 380 380 Male POS 1 69.98494 6.473748 7.031872 6.634808
## 388 388 Male POS 1 66.94593 6.172192 6.542353 6.315893
## 390 390 Male NEG 1 65.20192 6.049106 6.756951 6.334876
## 394 394 Female POS 1 74.28884 6.176199 6.742148 6.347111
## 403 403 Male NEG 1 61.62355 5.885767 6.643271 6.417598
## 406 406 Male POS 1 67.80014 6.251596 6.565337 6.412665
## 407 407 Female NEG 1 62.05065 6.387949 6.526086 6.390812
## 411 411 Male NEG 1 68.22177 6.162964 6.632073 6.394948
## 471 471 Female NEG 1 69.76318 6.247959 6.553351 6.534542
## 473 473 Female POS 1 81.44285 6.248213 6.654716 6.404635
## 477 477 Male NEG 1 81.36345 6.466705 6.732609 6.596663
## 480 480 Female NEG 1 71.78097 6.157917 6.913308 6.315593
## 482 482 Male NEG 1 75.69610 6.108606 6.564574 6.381925
## 483 483 Female POS 0 63.86585 6.335418 6.907938 6.557670
## 488 488 Female POS 1 79.45243 5.981184 6.514025 6.347060
## 498 498 Female NEG 0 61.46749 6.661186 7.028357 6.705888
## 507 507 Male POS 1 76.62697 6.461849 6.400309 6.723188
## 509 509 Female NEG 0 72.96646 6.043422 6.413871 6.200552
## 510 510 Male POS 0 61.05133 6.103153 6.522613 6.080440
## 516 516 Male NEG 0 69.31691 6.452975 6.936307 6.701086
## 518 518 Male POS 1 67.14853 6.081921 6.693521 6.384823
## 528 528 Male NEG 0 67.82204 6.167301 6.604118 6.381816
## 530 530 Male NEG 0 60.82409 6.530960 6.700354 6.722223
## 550 550 Female NEG 1 71.25804 6.444818 6.926947 6.458020
## 556 556 Male NEG 0 71.60301 6.279505 6.615724 6.410287
## 557 557 Female NEG 1 68.27105 5.928358 6.802994 6.246338
## 566 566 Male NEG 0 64.10678 6.190377 6.676997 6.452718
## 567 567 Female POS 0 63.02259 6.312758 6.829371 6.695018
## 570 570 Male POS 0 77.84257 6.288248 6.505913 6.344869
## 573 573 Male NEG 0 69.02943 6.288157 6.912437 6.307653
## 582 582 Male POS 1 84.77481 6.036305 6.688572 6.164308
## 588 588 Male POS 1 62.78439 5.925922 6.585664 6.202425
## 590 590 Female POS 1 91.78919 5.927381 6.743559 6.333311
## 609 609 Female POS 0 70.54894 6.314319 6.715392 6.283178
## 615 615 Male NEG 0 77.22382 6.113175 6.582526 6.363531
## 657 657 Female POS 1 80.16975 6.131198 6.639902 6.566688
## 658 658 Female POS 0 62.09446 6.223612 6.789396 6.432003
## 661 661 Male NEG 0 69.35250 6.144306 6.567274 6.564063
## 666 666 Male NEG 0 66.18754 6.197882 6.643357 6.285649
## 676 676 Female POS 1 77.86448 6.389598 6.701799 6.555583
## 696 696 Female NEG 1 61.30322 6.111071 6.584321 6.204916
## 697 697 Male NEG 0 64.21903 6.051107 6.409244 6.329083
## 698 698 Female NEG 0 85.55510 5.885922 6.435411 6.400886
## 699 699 Male NEG 1 72.21629 6.299913 6.326878 6.504349
## 702 702 Male POS 0 82.55441 5.861619 6.613266 6.139776
## 707 707 Male NEG 0 71.25257 6.862296 7.065670 7.127398
## 716 716 Male NEG 0 68.97741 5.849523 6.586282 6.054108
## 722 722 Male NEG 0 74.89938 6.327505 6.636835 6.403534
## 736 736 Female NEG 0 71.57563 6.140351 6.557702 6.321128
## 737 737 Male NEG 0 69.07324 5.917668 6.918993 6.152954
## 757 757 Male POS 1 70.56263 5.966071 6.561388 6.119326
## 771 771 Female NEG 0 60.16427 7.373542 7.229125 7.534851
## 772 772 Female POS 0 70.32991 6.429591 6.741193 6.671919
## 784 784 Female NEG 0 72.91718 6.251406 6.624422 6.281317
## 795 795 Female POS 0 76.85421 6.502011 6.825250 6.471669
## 796 796 Female NEG 0 74.85558 6.456219 6.838002 6.571417
## 798 798 Female NEG 0 68.69268 6.445431 6.814739 6.404046
## 808 808 Male POS 1 73.20465 6.336130 6.488948 6.350209
## 814 814 Male NEG 0 82.12457 6.032602 6.831285 6.317968
## 815 815 Female POS 1 69.62902 6.594248 6.870032 6.695476
## 819 819 Female POS 1 71.47159 6.395337 6.492036 6.575456
## 838 838 Male POS 0 82.23135 6.454275 6.981217 6.755547
## 844 844 Female NEG 0 62.23956 6.470576 6.757153 6.512149
## 851 851 Male POS 1 84.49555 6.500313 6.952976 6.675960
## 877 877 Male POS 1 75.84668 6.162636 6.782607 6.369384
## 890 890 Female POS 1 76.61875 6.479070 6.563948 6.594955
## 891 891 Female NEG 0 63.95072 5.944790 6.690265 6.218748
## 895 895 Male POS 0 73.12252 6.244164 6.816867 6.378894
## 904 904 Female NEG 1 63.04723 6.260028 6.596380 6.234968
## 906 906 Male NEG 0 75.63860 6.600079 7.066462 6.583025
## 914 914 Male NEG 0 71.90418 7.033261 7.021717 6.947102
## 917 917 Male POS 1 75.71800 6.413357 6.729174 6.739102
## 918 918 Female POS 1 58.83094 6.936336 7.146546 7.222852
## 930 930 Male POS 1 71.60849 6.253725 6.712326 6.519016
## 931 931 Male NEG 0 86.10541 6.595277 6.712869 6.622484
## 938 938 Female NEG 0 60.72005 6.010315 6.689541 6.218816
## 939 939 Female POS 1 76.46270 6.445422 6.603950 6.860858
## 942 942 Female POS 1 71.88227 6.078821 6.600787 6.319793
## 945 945 Female POS 1 72.54757 5.896341 6.987305 6.120857
## 952 952 Female POS 1 85.07050 6.440366 6.812504 6.577425
## 959 959 Male POS 1 76.41342 6.623003 6.619489 6.638507
## 965 965 Female POS 0 78.98973 6.639037 6.583663 6.843239
## 970 970 Male POS 1 78.16290 6.371120 6.836754 6.519902
## 971 971 Female POS 1 84.10130 6.113747 6.612382 6.345419
## 990 990 Female POS 0 79.70431 5.968143 6.700227 6.380947
## 993 993 Male POS 0 61.79603 6.172072 6.690875 6.351244
## 1000 1000 Female NEG 1 66.87201 6.288050 6.794610 6.584681
## 1001 1001 Male NEG 0 74.12457 6.806164 6.932284 6.621855
## 1010 1010 Male NEG 0 60.22177 6.121814 6.690690 6.419014
## 1013 1013 Female POS 1 72.71458 6.246259 6.374263 6.338173
## 1032 1032 Male POS 1 75.08556 6.334371 6.562914 6.374871
## 1050 1050 Male POS 1 64.77207 5.741934 6.678802 6.012837
## 1066 1066 Female POS 1 59.05818 6.187258 6.695226 6.483559
## 1067 1067 Female NEG 0 81.62355 5.985368 6.794342 6.059024
## 1078 1078 Female POS 1 83.41958 6.028581 6.466813 6.465285
## 1089 1089 Female POS 1 59.61944 6.214467 6.816567 6.706962
## 1091 1091 Female POS 0 70.47502 6.193594 6.131500 6.385019
## 1094 1094 Male POS 1 68.89528 6.025145 6.694290 6.502906
## 1096 1096 Male POS 1 71.56742 5.949029 6.581726 6.153074
## 1100 1100 Male POS 1 69.91376 6.233982 6.698891 6.205360
## 1102 1102 Male POS 1 71.41684 5.893660 6.551811 6.189609
## 1104 1104 Male NEG 0 72.42163 6.124993 6.534687 6.409842
## 1109 1109 Male POS 0 78.26694 6.145877 6.601277 6.405181
## 1122 1122 Male POS 1 60.35866 5.676284 6.788442 6.196706
## 1125 1125 Female NEG 0 85.11978 6.204285 6.678693 6.388874
## 1135 1135 Female POS 1 79.79466 5.934548 6.552959 6.248266
## 1139 1139 Male POS 1 55.14853 6.016007 6.934537 6.251933
## 1147 1147 Male NEG 0 60.76934 6.108658 6.714836 6.346043
## 1153 1153 Female NEG 0 75.75633 6.305337 6.887700 6.502986
## 1157 1157 Male NEG 1 67.60301 6.018838 6.725353 6.255278
## 1174 1174 Male POS 0 83.76728 6.162833 6.799695 6.459972
## 2315739 2315894
## 4 6.516567 8.176795
## 5 6.252506 8.635744
## 12 6.384537 7.963935
## 14 6.301732 8.043594
## 21 6.129048 7.945633
## 23 6.133969 7.968608
## 25 6.367791 8.240272
## 26 6.460154 8.756544
## 28 6.379973 8.661552
## 29 6.172999 8.285472
## 31 6.235320 8.119470
## 33 6.215513 8.060683
## 36 6.578067 8.828091
## 39 6.318249 8.426085
## 40 6.488439 8.371069
## 42 6.666519 8.786018
## 43 6.197191 8.188000
## 47 6.445460 8.476699
## 50 6.402781 8.389101
## 52 6.548507 8.399823
## 53 6.191993 8.172368
## 55 6.373156 8.389860
## 56 6.542421 8.621589
## 57 6.286881 8.231618
## 59 6.396478 8.412143
## 60 6.042320 8.042496
## 61 6.383923 7.818437
## 62 6.248329 8.142817
## 64 6.475421 8.077885
## 66 6.460657 8.267160
## 67 6.388726 8.091714
## 68 6.591385 8.389016
## 72 6.089050 7.638819
## 75 6.245394 8.029725
## 79 6.652024 8.249711
## 80 6.474735 8.007391
## 86 6.402931 8.297299
## 88 6.503762 8.521326
## 89 6.401387 8.665788
## 90 6.191511 8.089617
## 91 6.426800 8.399407
## 97 6.720172 8.717953
## 99 6.581944 8.220909
## 100 6.275917 7.936677
## 101 6.360023 8.455458
## 102 6.387004 8.135961
## 103 6.189617 8.048258
## 104 6.667132 8.060595
## 110 6.160518 8.067879
## 111 6.966086 8.967732
## 112 6.385202 8.012695
## 113 6.463539 8.460742
## 117 6.424220 8.395555
## 126 6.399296 8.219280
## 138 6.595803 9.324923
## 152 6.433069 8.130344
## 259 6.311515 8.487506
## 270 6.485162 7.571880
## 273 6.632758 8.556086
## 276 6.411020 7.488878
## 277 6.286240 8.469508
## 278 6.145291 8.382221
## 280 6.137446 8.230918
## 287 6.449422 8.505350
## 289 6.443845 8.325384
## 294 6.487336 7.854833
## 301 6.402543 8.157133
## 310 6.381600 7.875573
## 315 6.412068 8.418653
## 317 6.468000 8.143446
## 331 6.329408 8.128533
## 342 6.514058 8.658526
## 349 6.767479 8.098890
## 354 6.646620 7.959876
## 362 6.273409 8.596950
## 365 6.249836 7.895243
## 366 6.688523 8.673346
## 367 6.550660 8.093363
## 371 6.227942 7.830658
## 372 6.433531 8.095706
## 373 6.357371 8.256031
## 374 6.398171 8.329469
## 376 6.806113 8.325484
## 377 6.619245 8.362890
## 378 6.077606 8.060377
## 380 6.437845 8.313434
## 388 6.127493 8.013600
## 390 6.445356 8.181027
## 394 6.451033 8.369798
## 403 6.241045 7.688826
## 406 6.464875 8.119255
## 407 6.427262 8.448072
## 411 6.307937 8.530749
## 471 6.534982 8.574288
## 473 6.391032 8.577297
## 477 6.531321 8.138791
## 480 6.361480 8.353346
## 482 6.062568 8.005850
## 483 6.391227 8.274258
## 488 6.268688 8.123853
## 498 6.560862 8.571118
## 507 6.617919 8.555241
## 509 6.238213 7.816772
## 510 6.092995 8.371996
## 516 6.620882 8.796185
## 518 6.232935 8.199298
## 528 6.381270 8.076391
## 530 6.600723 8.458335
## 550 6.564602 8.257652
## 556 6.560507 8.324118
## 557 6.294528 8.458825
## 566 6.139174 8.258713
## 567 6.631280 8.579966
## 570 6.298944 8.309118
## 573 6.363114 8.533878
## 582 6.484227 8.350568
## 588 6.167237 8.102750
## 590 6.105932 8.145313
## 609 6.358180 8.711746
## 615 6.347374 7.808396
## 657 6.630271 8.411060
## 658 6.295594 8.668870
## 661 6.427691 8.158172
## 666 6.204427 7.766536
## 676 6.761706 8.707959
## 696 6.237833 8.388425
## 697 6.091262 8.242050
## 698 6.206039 8.296143
## 699 6.231725 7.766098
## 702 6.187461 8.181405
## 707 6.797781 8.444630
## 716 6.266113 8.082044
## 722 6.650641 8.631772
## 736 6.373879 8.317890
## 737 6.386691 8.162499
## 757 6.148482 8.317301
## 771 6.949544 9.848143
## 772 6.555872 8.299831
## 784 6.453931 8.557491
## 795 6.580387 8.465663
## 796 6.407076 8.677469
## 798 6.631139 8.378032
## 808 6.476528 8.268881
## 814 6.302563 8.518802
## 815 6.621854 8.654187
## 819 6.292928 8.189562
## 838 6.847003 8.376027
## 844 6.317192 8.259769
## 851 6.542828 8.538429
## 877 6.447839 8.211258
## 890 6.403709 8.523652
## 891 6.373590 8.380272
## 895 6.359654 8.045609
## 904 6.342941 8.058783
## 906 6.600772 8.721166
## 914 6.914673 8.860534
## 917 6.528452 8.671336
## 918 6.929756 9.457507
## 930 6.374662 8.126438
## 931 6.329473 8.566715
## 938 6.268462 7.888268
## 939 6.401725 8.699766
## 942 6.268897 8.011411
## 945 6.698476 8.136579
## 952 6.535240 8.552151
## 959 6.650298 8.245840
## 965 6.573021 8.112980
## 970 6.405051 8.644900
## 971 6.082821 7.816655
## 990 6.624148 8.154736
## 993 6.283437 8.478330
## 1000 6.590041 8.503554
## 1001 6.580209 8.622928
## 1010 6.404956 8.200976
## 1013 6.394821 8.342434
## 1032 6.315333 8.551981
## 1050 6.171498 7.929635
## 1066 6.204242 8.388649
## 1067 6.005664 8.000128
## 1078 5.974082 7.463696
## 1089 6.371726 9.151203
## 1091 6.120406 7.768187
## 1094 6.578118 8.651970
## 1096 6.331987 8.352128
## 1100 6.490084 8.171842
## 1102 6.346412 8.135509
## 1104 6.249486 7.952976
## 1109 6.498179 8.369956
## 1122 6.375254 8.363321
## 1125 6.307210 7.882459
## 1135 6.136527 8.130718
## 1139 6.366817 8.260760
## 1147 6.330977 8.552363
## 1153 6.466313 8.538178
## 1157 6.368882 8.001666
## 1174 6.704779 8.590583
Run PCA
pca <- prcomp(exdata2[6:22016], center = T, scale. = T) # run PCA
# plot scree for PCA
library(factoextra)
fviz_eig(pca)
PCA scatterplot for PET status
library(ggfortify)
autoplot(pca, data = exdata2, colour = "PET", label = T) +
viridis::scale_colour_viridis(option = "viridis", discrete = T)
PET scatterplot for apoe4 status
library(ggfortify)
autoplot(pca, data = exdata2, colour = "apoe4", label = T)
qqnorm(pca$x[,2], pch = 20)
Exclude outliers and run PCA again
exdata3 <- filter(exdata2, AIBL.Id != "771" & AIBL.Id != "914" & AIBL.Id != "918")
Run PCA
pca2 <- prcomp(exdata3[6:22016], center = T, scale. = T) # run PCA
library(factoextra)
fviz_eig(pca2)
Plot without outliers (PET)
library(ggfortify)
autoplot(pca2, data = exdata3, colour = "PET", label = T, label.label = "AIBL.Id") +
viridis::scale_colour_viridis(option = "viridis", discrete = T)
Plot without outliers (apoe4)
library(ggfortify)
autoplot(pca2, data = exdata3, colour = "apoe4", label = T, label.label = "AIBL.Id")
limma to analyse expression data.We first create an ‘ExpressionSet’ with genes on rows, and samples on columns, following the guide here: https://www.bioconductor.org/packages/3.7/bioc/vignettes/Biobase/inst/doc/ExpressionSetIntroduction.pdf
library(Biobase)
exprs <- as.matrix(t(exdata3[,6:22016])) # leaving behind the first 5 metadata columns
minimalSet <- ExpressionSet(assayData = exprs)
pData <- exdata3[,1:5]
summary(pData)
## AIBL.Id Demographic.Sex PET apoe4
## Min. : 4.0 Female:96 NEG: 92 Min. :0.000
## 1st Qu.: 110.0 Male :97 POS:101 1st Qu.:0.000
## Median : 480.0 Median :0.000
## Mean : 506.2 Mean :0.456
## 3rd Qu.: 819.0 3rd Qu.:1.000
## Max. :1174.0 Max. :1.000
## Age
## Min. :55.15
## 1st Qu.:67.15
## Median :72.71
## Mean :72.86
## 3rd Qu.:78.90
## Max. :91.79
library(limma)
PET_apoe4 <- paste(pData$PET, pData$apoe4, sep = ".") # put PET/apoe4 combinations into a vector
PET_apoe4 <- factor(PET_apoe4, levels = c(unique(PET_apoe4))) # turn it into a factor
design <- model.matrix(~0 + PET_apoe4) # create factor table for four combinations
colnames(design) <- levels(PET_apoe4)
fit <- lmFit(exprs, design)
Set four pair-wise contrasts of interest and compute the contrasts and moderated t-tests.
cont.matrix <- makeContrasts(apoe4_for_PET_yes = POS.1 - POS.0,
apoe4_for_PET_no = NEG.1 - NEG.0,
PET_for_apoe4_yes = POS.1 - NEG.1,
PET_for_apoe4_no = POS.0 - NEG.0,
levels = design)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
Get the genes differentially expressed in each comparison
res1 <- topTable(fit2, coef = "apoe4_for_PET_yes", n = Inf)
res2 <- topTable(fit2, coef = "apoe4_for_PET_no", n = Inf)
res3 <- topTable(fit2, coef = "PET_for_apoe4_yes", n = Inf)
res4 <- topTable(fit2, coef = "PET_for_apoe4_no", n = Inf)
length(which(res1$P.Value < 0.05))
## [1] 653
length(which(res2$P.Value < 0.05))
## [1] 684
length(which(res3$P.Value < 0.05))
## [1] 924
length(which(res4$P.Value < 0.05))
## [1] 769
Probe IDs are set in the ‘res’ data frames as rownames. Create separate column for rownames. Need to do it now, because when filter() is applied below, it removes the rownames.
for (i in ls(pattern = "res")){
x <- get(i)
x$probe_id <- rownames(x)
assign(i, x)
}
Get list of genes that show significantly different expression between contrasts
signif_apoe4_for_PET_yes <- filter(res1, P.Value < 0.05)
signif_apoe4_for_PET_no <- filter(res2, P.Value < 0.05)
signif_PET_for_apoe4_yes <- filter(res3, P.Value < 0.05)
signif_PET_for_apoe4_no <- filter(res4, P.Value < 0.05)
rm(list = ls(pattern = "res")) # remove results to clear working memory
Using JD’s code from ‘AIBL_Gene_Expression_SetUp_Dec82017.R’
library(pd.huex.1.0.st.v2)
library(huex10sttranscriptcluster.db)
### pull gene symbols and match to probe ids to replace rownames ###
x <- huex10sttranscriptclusterSYMBOL
# Get the probe identifiers that are mapped to a gene symbol
mapped_probes <- mappedkeys(x)
# Convert to a list
xx <- as.list(x[mapped_probes])
if(length(xx) > 0) {
# Get the SYMBOL for the first five probes
xx[1:5]
# Get the first one
xx[[1]]
}
## [1] "LOC102723600"
xx1 <- as.matrix(unlist(xx))
# Get table of probes
gns <- rownames(exdata)
gnsl <- xx1[rownames(xx1) %in% gns]
length(gnsl)
## [1] 14902
### Find ENSEMBL IDs ###
x3 <- huex10sttranscriptclusterENSEMBL
# Get the entrez gene IDs that are mapped to an Ensembl ID
mapped_genes <- mappedkeys(x3)
# Convert to a list
xx3 <- as.list(x3[mapped_genes])
xx3.1 <- as.matrix(unlist(xx3))
ens <- xx3.1[rownames(xx3.1) %in% gns]
length(ens)
## [1] 13954
### Find Entrez IDs ###
x4 <- huex10sttranscriptclusterENTREZID
# Get the probe identifiers that are mapped to an ENTREZ Gene ID
mapped_probes <- mappedkeys(x4)
# Convert to a list
xx4 <- as.list(x4[mapped_probes])
xx4.1 <- as.matrix(unlist(xx4))
entz <- xx4.1[rownames(xx4.1) %in% gns]
length(entz)
## [1] 14902
Create data frame of all probe matches with identifiers
pid_symb <- data.frame(xx1)
pid_symb$probe_id <- rownames(pid_symb)
pid_ens <- data.frame(xx3.1)
pid_ens$probe_id <- rownames(pid_ens)
pid_ent <- data.frame(xx4.1)
pid_ent$probe_id <- rownames(pid_ent)
pid_all <- dplyr::left_join(pid_symb, pid_ent, by = "probe_id") %>%
dplyr::left_join(y = pid_ens, by = "probe_id") %>%
dplyr::rename(gene_symbol = "xx1",
entrez_id = "xx4.1",
ensembl_id = "xx3.1") %>%
dplyr::select(probe_id, everything())
# write.table(pid_all, "~/Desktop/Uni Air/CSIRO/AD_project/Working/affy_probes_annotated.txt", row.names = F, col.names = T, quote = F, sep = "\t")
# find how many probes are annotated
length(which(pid_all$probe_id %in% rownames(exdata)))
## [1] 14902
Combine lists of significant genes for each contrast. Note that combined counts of significant genes for all four contrasts is 3030.
signif_all <- full_join(signif_apoe4_for_PET_yes, signif_apoe4_for_PET_no, by = "probe_id") %>%
full_join(signif_PET_for_apoe4_no, by = "probe_id") %>%
full_join(signif_apoe4_for_PET_yes, by = "probe_id")
signif_all <- dplyr::select(signif_apoe4_for_PET_yes, probe_id, P.Value) %>%
full_join(dplyr::select(signif_apoe4_for_PET_no, probe_id, P.Value), by = "probe_id") %>%
full_join(dplyr::select(signif_PET_for_apoe4_yes, probe_id, P.Value), by = "probe_id") %>%
full_join(dplyr::select(signif_PET_for_apoe4_no, probe_id, P.Value), by = "probe_id") %>%
dplyr::rename(P.Value.c1 = "P.Value.x",
P.Value.c2 = "P.Value.y",
P.Value.c3 = "P.Value.x.x",
P.Value.c4 = "P.Value.y.y")
nrow(signif_all)
## [1] 2438
Therefore 2438 genes are expressed differentially across at least two contrasts
Bind gene ids to ‘signif_all’
sig_all2 <- left_join(signif_all, pid_all, by = "probe_id")
There are many NAs in the ‘gene_symbol’ column, even though there are 34,127 unique probe_ids in ‘pid_symb’ - one would think they would all be accounted for.
length(setdiff(signif_all$probe_id, pid_all$probe_id))
## [1] 989
Shows that there are 989 probe ids in ‘signif_all’ that are not in ‘pid_all’, i.e. they do not have any annotation data.
Confirmed by:
length(which(signif_all$probe_id %in% pid_all$probe_id == "TRUE"))
## [1] 1449
So are all probe_ids in the expression data in ‘pid_symb’?
length(setdiff(rownames(exdata), pid_symb$probe_id))
## [1] 7109
7109/22011 probes are not accounted for in the annotation data?
Confirm this:
length(which(rownames(exdata) %in% pid_all$probe_id == "TRUE"))
## [1] 14902
library(VennDiagram)
grid.newpage()
venn <- draw.quad.venn(area1 = length(which(!is.na(sig_all2$P.Value.c1))),
area2 = length(which(!is.na(sig_all2$P.Value.c2))),
area3 = length(which(!is.na(sig_all2$P.Value.c3))),
area4 = length(which(!is.na(sig_all2$P.Value.c4))),
n12 = length(which(!is.na(sig_all2$P.Value.c1) & !is.na(sig_all2$P.Value.c2))),
n13 = length(which(!is.na(sig_all2$P.Value.c1) & !is.na(sig_all2$P.Value.c3))),
n14 = length(which(!is.na(sig_all2$P.Value.c1) & !is.na(sig_all2$P.Value.c4))),
n23 = length(which(!is.na(sig_all2$P.Value.c2) & !is.na(sig_all2$P.Value.c3))),
n24 = length(which(!is.na(sig_all2$P.Value.c2) & !is.na(sig_all2$P.Value.c4))),
n34 = length(which(!is.na(sig_all2$P.Value.c3) & !is.na(sig_all2$P.Value.c4))),
n123 = length(which(!is.na(sig_all2$P.Value.c1) & !is.na(sig_all2$P.Value.c2) & !is.na(sig_all2$P.Value.c3))),
n124 = length(which(!is.na(sig_all2$P.Value.c1) & !is.na(sig_all2$P.Value.c2) & !is.na(sig_all2$P.Value.c4))),
n134 = length(which(!is.na(sig_all2$P.Value.c1) & !is.na(sig_all2$P.Value.c3) & !is.na(sig_all2$P.Value.c4))),
n234 = length(which(!is.na(sig_all2$P.Value.c2) & !is.na(sig_all2$P.Value.c3) & !is.na(sig_all2$P.Value.c4))),
n1234 = length(which(!is.na(sig_all2$P.Value.c1) & !is.na(sig_all2$P.Value.c2) & !is.na(sig_all2$P.Value.c3) & !is.na(sig_all2$P.Value.c4))),
category = c("apoe4_for_PET_yes", "apoe4_for_PET_no", "PET_for_apoe4_yes", "PET_for_apoe4_no"),
cat.pos = c(0, 0, 0, 0))
Many of the same genes (265) are differentially expressed between: * PET pos/neg when they have the apoe4 allele; and * apoe4 yes/no when they are negative for PET
These could be genes that can protect against AD.
Many of the same genes (189) are also differentially expressed between: * PET pos/neg when they do not have the apoe4 allele; and * apoe4 yes/no when they are positive for PET
These latter genes must be acting independently of the apoe4 network to cause AD.
Find out how many people are in each group:
test <- exdata3[,1:5]
length(which(test$PET == "POS" & test$apoe4 == "1"))
## [1] 65
length(which(test$PET == "POS" & test$apoe4 == "0"))
## [1] 36
length(which(test$PET == "NEG" & test$apoe4 == "1"))
## [1] 23
length(which(test$PET == "NEG" & test$apoe4 == "0"))
## [1] 69
This suggests that more people tend to either have apoe4 and AD, or not have apoe4 and not have AD, than otherwise.
Create design matrix
Group <- factor(exdata3$PET, levels = c("POS", "NEG"))
design2 <- model.matrix(~0 + Group)
colnames(design2) <- c("POS", "NEG")
Find differentially expressed genes
fit3 <- lmFit(exprs, design2)
cont.matrix2 <- makeContrasts(POSvsNEG = POS - NEG, levels = design2)
fit4 <- contrasts.fit(fit3, cont.matrix2)
fit4 <- eBayes(fit4)
topTable(fit4, adjust = "BH")
## logFC AveExpr t P.Value adj.P.Val B
## 2328611 -0.24349043 6.931149 -3.781354 0.0002074070 0.9997102 0.50013583
## 3997946 0.14334623 9.035701 3.648961 0.0003381087 0.9997102 0.07930919
## 2566383 -0.08138720 8.334744 -3.626837 0.0003664051 0.9997102 0.01024134
## 2694314 0.10608542 6.813847 3.553376 0.0004772006 0.9997102 -0.21648884
## 3707258 0.06258903 6.911761 3.470792 0.0006390722 0.9997102 -0.46656387
## 3957224 0.08050569 6.324961 3.469708 0.0006415049 0.9997102 -0.46981262
## 3934479 0.07847492 6.465840 3.468142 0.0006450341 0.9997102 -0.47450352
## 3819771 0.21167767 5.350143 3.463788 0.0006549411 0.9997102 -0.48753481
## 3462630 -0.07185294 3.382152 -3.439265 0.0007134666 0.9997102 -0.56067593
## 4038494 0.10130047 3.711608 3.427094 0.0007442989 0.9997102 -0.59680772
results_PETposvneg <- topTable(fit4, n = Inf)
results_PETposvneg$probe_id <- rownames(results_PETposvneg)
signif_PET_pos_v_neg <- filter(results_PETposvneg, P.Value < 0.05)
dim(signif_PET_pos_v_neg)
## [1] 865 7
Now for apoe4
Group <- factor(exdata3$apoe4, levels = c(0, 1))
design3 <- model.matrix(~0 + Group)
colnames(design3) <- c("NEG", "POS")
Find differentially expressed genes
fit5 <- lmFit(exprs, design3)
cont.matrix3 <- makeContrasts(apoe4 = NEG - POS, levels = design3)
fit6 <- contrasts.fit(fit5, cont.matrix3)
fit6 <- eBayes(fit6)
topTable(fit6, adjust = "BH")
## logFC AveExpr t P.Value adj.P.Val B
## 3360287 0.17135398 6.377782 4.107229 5.892379e-05 0.9997616 1.58861565
## 3453446 0.20354250 4.358425 3.702309 2.781176e-04 0.9997616 0.24693729
## 3275398 0.27781363 6.503623 3.643283 3.451692e-04 0.9997616 0.06124851
## 3749570 0.18071224 6.920419 3.505805 5.650003e-04 0.9997616 -0.36123353
## 3450180 -0.08479851 5.565530 -3.475845 6.278481e-04 0.9997616 -0.45142875
## 3719456 0.15291395 3.122911 3.416413 7.723809e-04 0.9997616 -0.62834664
## 3150579 -0.15573427 5.935585 -3.391778 8.409687e-04 0.9997616 -0.70089559
## 2777276 0.07471033 4.025781 3.335889 1.018232e-03 0.9997616 -0.86377471
## 2342904 0.09381612 5.328055 3.331353 1.034053e-03 0.9997616 -0.87689050
## 2432714 -0.20563079 6.724809 -3.232473 1.441337e-03 0.9997616 -1.15884979
results_apoe4posvneg <- topTable(fit6, n = Inf)
results_apoe4posvneg$probe_id <- rownames(results_apoe4posvneg)
signif_apoe4_pos_v_neg <- filter(results_apoe4posvneg, P.Value < 0.05)
dim(signif_apoe4_pos_v_neg)
## [1] 554 7
Create table of all significant differentially expressed genes
sig_direct_pairs <- full_join(
dplyr::select(signif_PET_pos_v_neg, P.Value, probe_id),
dplyr::select(signif_apoe4_pos_v_neg, P.Value, probe_id),
by = "probe_id") %>%
dplyr::select(probe_id, everything())
colnames(sig_direct_pairs)[2] <- "P.Value_PET"
colnames(sig_direct_pairs)[3] <- "P.Value_apoe4"
library(VennDiagram)
grid.newpage()
venn2 <- draw.pairwise.venn(
area1 = length(which(!is.na(sig_direct_pairs$P.Value_PET))),
area2 = length(which(!is.na(sig_direct_pairs$P.Value_apoe4))),
cross.area = length(which(!is.na(sig_direct_pairs$P.Value_PET) & !is.na(sig_direct_pairs$P.Value_apoe4))),
category = c("PET_pos_v_neg", "apoe4_pos_v_neg"),
ext.text = FALSE, cat.prompts = T, cat.pos = 0)
## INFO [2018-02-28 15:49:55] Placing category labels at default outer locations. Use 'cat.pos' and 'cat.dist' to modify location.
## INFO [2018-02-28 15:49:55] Current 'cat.pos': 0 degrees, 0 degrees
## INFO [2018-02-28 15:49:55] Current 'cat.dist': 0.025 , 0.025
Import IDs for genotyped SNPs and proxy SNPs (those in LD with the genotyped SNPs), and annotate them with gene names
install.packages("tidyverse")
source("https://bioconductor.org/biocLite.R")
biocLite()
biocLite("biomaRt")
biocLite("GenomicRanges")
library(GenomicRanges)
biocLite("Homo.sapiens")
library(Homo.sapiens)
biocLite("snpStats")
install.packages("MatrixEQTL")
aibl_snps <- read.delim("~/Dropbox/eQTL/Data/Expanded SNP set/AIBLgene_SNP_LIST_04032015.tsv")
snap_edit <- read.delim("~/Dropbox/eQTL/Data/Expanded SNP set/snapResults_edit.txt")
snap <- read.delim("~/Dropbox/eQTL/Data/Expanded SNP set/snapResults.txt")
Note: ‘snap_edit’ has removed all SNPs from ‘snap’ that did not have proxies.
# How many SNPs are there in snap_edit?
length(unique(snap_edit$SNP)) # Shows that 2084 - 1527 = 557 have been removed from aibl_snps.
## [1] 1527
# How many genes are there in aibl_snps?
length(unique(aibl_snps$GENE))
## [1] 298
# How many aibl_snps don't have an associated gene?
which(aibl_snps$GENE == "")
## [1] 116 464 511 512 668 675 788 856 877 896 897 960 1342 1409
## [15] 1612 1989
# One entry has "-" in the gene column.
which(aibl_snps$GENE == "-") # shows that there is 1 additional SNP with no associated gene
## [1] 415
length(setdiff(aibl_snps$SNP, snap_edit$SNP)) # same number as above - 557 removed.
## [1] 557
# How many unique proxy SNPs?
length(unique(snap_edit$Proxy))
## [1] 18118
library(tidyverse)
AIBLdummy <- dplyr::select(aibl_snps, SNP, GENE) # creates dummy data frame with just two variables
snap2 <- left_join(snap_edit, AIBLdummy, by = "SNP") # combine data frames
rm(AIBLdummy) # remove dummy
snap2 <- dplyr::select(snap2, SNP, SNP_gene = "GENE", dplyr::everything())# reorder columns
Get ensembl ids and loci from BioMart
snps <- as.vector(aibl_snps$SNP) # Create vector of aibl_snps
library(biomaRt)
listMarts()
mart_snps <- useMart('ENSEMBL_MART_SNP')
listDatasets(mart_snps) # lists datasets available
mart_snps <- useMart('ENSEMBL_MART_SNP', 'hsapiens_snp')
SNP_genes <- getBM(attributes = c("refsnp_id", "allele_1", "minor_allele", "minor_allele_freq", "chr_name", "chrom_start", "chrom_end", "ensembl_gene_stable_id"),
filters = "snp_filter",
values = snps,
mart = mart_snps)
See how many SNPs returned results
length(unique(SNP_genes$refsnp_id))
## [1] 2075
Use ‘ensembl_ids’ to get loci and hgnc symbols for genes
ensembl_ids <- unique(SNP_genes$ensembl_gene_stable_id) # Create `ensembl_ids' vector of unique ensembl gene ids from 'SNP_genes'
listMarts()
mart_genes <- useMart('ENSEMBL_MART_ENSEMBL')
listDatasets(mart_genes)
mart_genes <- useMart('ENSEMBL_MART_ENSEMBL', 'hsapiens_gene_ensembl')
ensembl_genes <- getBM(attributes = c("ensembl_gene_id", "chromosome_name", "start_position", "end_position", "strand", "hgnc_symbol", "entrezgene"),
filters = "ensembl_gene_id",
values = ensembl_ids,
mart = mart_genes)
Bind both tables by ensembl_id to give full table of genes associated with the AIBL snps
SNP_genes_full <- dplyr::left_join(SNP_genes, ensembl_genes, by = c("ensembl_gene_stable_id" = "ensembl_gene_id"))
Remove duplicate rows with “LRG…” in the ‘ensembl_gene_stable_id’ column
SNP_genes_full <- SNP_genes_full[-grep("LRG", SNP_genes_full$ensembl_gene_stable_id), ]
dim(SNP_genes_full)
## [1] 3492 14
length(unique(SNP_genes_full$refsnp_id))
## [1] 2075
length(unique(SNP_genes_full$ensembl_gene_stable_id))
## [1] 621
# SNP_genes_full$ensembl_gene_stable_id has blank entries - replace with 'NA'
SNP_genes_full$ensembl_gene_stable_id[SNP_genes_full$ensembl_gene_stable_id == ""] <- NA
length(unique(SNP_genes_full$refsnp_id[!is.na(SNP_genes_full$ensembl_gene_stable_id)]))
## [1] 1994
# do the same for hgnc_symbol
SNP_genes_full$hgnc_symbol <- as.character(SNP_genes_full$hgnc_symbol) # convert to character
SNP_genes_full$hgnc_symbol[SNP_genes_full$hgnc_symbol == ""] <- NA # replace blank values with NA
## write table
# write.table(SNP_genes_full, "~/Desktop/Uni Air/CSIRO/AD_project/Working/aibl_snp_genes.txt", sep = "\t", col.names = T, row.names = F, quote = F)
Get ensembl ids and loci for proxies
proxy_snps <- as.vector(unique(snap_edit$Proxy)) # Create vector of proxy snps
length(proxy_snps)
listMarts()
mart_snps <- useMart('ENSEMBL_MART_SNP')
listDatasets(mart_snps) # lists datasets available
mart_snps <- useMart('ENSEMBL_MART_SNP', 'hsapiens_snp')
proxy_genes <- getBM(attributes = c("refsnp_id", "chr_name", "chrom_start", "chrom_end", "ensembl_gene_stable_id"),
filters = "snp_filter",
values = proxy_snps,
mart = mart_snps)
See how many SNPs returned results loci information
length(unique(proxy_genes$refsnp_id))
## [1] 17847
Remove rows with “LRG” in ‘ensemble_gene_stable_id’
proxy_genes <- proxy_genes[-grep("LRG", proxy_genes$ensembl_gene_stable_id), ]
dim(proxy_genes)
## [1] 24632 5
Create `ensembl_ids’ vector of unique ensembl gene ids from ‘proxy_genes’
ensembl_ids_proxies <- unique(proxy_genes$ensembl_gene_stable_id)
length(ensembl_ids_proxies)
## [1] 1103
Use ‘ensembl_ids’ to get loci and hgnc symbols for genes
listMarts()
mart_genes <- useMart('ENSEMBL_MART_ENSEMBL')
listDatasets(mart_genes)
mart_genes <- useMart('ENSEMBL_MART_ENSEMBL', 'hsapiens_gene_ensembl')
ensembl_proxy_genes <- getBM(attributes = c("ensembl_gene_id", "chromosome_name", "start_position", "end_position", "strand", "hgnc_symbol"),
filters = "ensembl_gene_id",
values = ensembl_ids_proxies,
mart = mart_genes)
Bind both tables by ensembl_id to give full table of genes associated with the proxy SNPs
proxy_genes_full <- left_join(proxy_genes, ensembl_proxy_genes, by = c("ensembl_gene_stable_id" = "ensembl_gene_id"))
dim(proxy_genes_full)
## [1] 24632 10
# substitute blank values in hgnc_symbol column with NA
proxy_genes_full$hgnc_symbol <- as.character(proxy_genes_full$hgnc_symbol) # convert to character
proxy_genes_full$hgnc_symbol[proxy_genes_full$hgnc_symbol == ""] <- NA # replace blank values with NA
proxy_gns_uq <- proxy_genes_full[is.na(proxy_genes_full$hgnc_symbol) == F, ] # create new data frame with SNPs that have been annotated with gene symbols
# get unique values
length(unique(proxy_gns_uq$refsnp_id))
## [1] 15255
length(unique(proxy_gns_uq$hgnc_symbol))
## [1] 795
## write table
# write.table(proxy_genes_full, "~/Desktop/Uni Air/CSIRO/AD_project/Working/proxy_snp_gns.txt", row.names = F, col.names = T, quote = F, sep = "\t")
Find out how many unique genes there are in the SNP and proxy lists
length(unique(SNP_genes_full$ensembl_gene_stable_id))
## [1] 621
length(unique(proxy_genes_full$ensembl_gene_stable_id))
## [1] 1103
Find out how many proxy genes are different from the SNP genes, and vice versa
length(setdiff(SNP_genes_full$ensembl_gene_stable_id, proxy_genes_full$ensembl_gene_stable_id))
## [1] 127
length(setdiff(proxy_genes_full$ensembl_gene_stable_id, SNP_genes_full$ensembl_gene_stable_id))
## [1] 609
So 127/621 genes in the AIBL SNP list are not in the proxy list, and 609/1103 genes in the proxy list are not in the SNP list.
library(readxl)
gtypes <- read_excel("~/Dropbox/eQTL/Data/Expanded SNP set/AIBLgene_SNP_Data_by_Gene_Expression_IDs.xlsx",
col_names = F)
exdata <- read.delim("~/Dropbox/eQTL/Data/AIBL_expression_set/AIBL_Gene_Expression.txt", sep = "", header = T)
Note that the AIBL IDs are incorrect in the excel file. We attach the correct IDs using the ‘AIBL_Gene_Expression_IDs_UpdtdDec2017.txt’ file.
ids2 <- read.delim("~/Dropbox/eQTL/Data/AIBL_expression_set/AIBL_Gene_Expression_IDs_UpdtdDec2017.txt", sep = " ")
# bind colnames for exdata to new IDs
ids2$old_ids <- as.integer(gsub("X", "", colnames(exdata)))
# reorder to same order as the SNP genotype data
ids2 <- ids2[order(ids2$old_ids), ]
# merge with rownames of genotype data to ensure perfect match
ids3 <- data.frame(gtypes[5:nrow(gtypes), 1])# extract AIBL IDs from genotype data
ids3$X__1 <- as.integer(ids3$X__1)
ids3 <- merge(ids3, ids2, by.x = "X__1", by.y = "old_ids")
# add new column names to the genotype data
ids2 <- as.vector(ids3$x)
gtypes$AIBL_ID_new <- c(NA, NA, NA, "AIBL_ID_new", as.vector(ids2))
gtypes <- dplyr::select(gtypes, AIBL_ID_new, dplyr::everything()) # reorder to bring AIBL_ID_new to the front
gts <- as.tibble(t(gtypes)) # transpose data frame
gts <- as.tibble(lapply(gts, function(x) { # convert all 'X_X' to NA
gsub("X_X", NA, x)
}))
Because some individuals failed to record a result for some SNPs in addition to the few individuals for whom no results were recorded, we will manually remove the latter.
colnames(gts) <- gts[3, ] # make the column names the (incorrect) AILB IDs
# some individuals have data missing for ~1,500 of the ~2,000 SNPs, e.g.
length(which(is.na(gts$`127`)) == TRUE)
## [1] 1515
x <- sapply(gts, function(x) { # gets TRUE or FALSE for each column with data missing for more than 1000 SNPs
length(which(is.na(x) == TRUE)) > 1000
})
y <- which(x == TRUE) # creates vector of column indexes for which it is true
gts[y] <- NULL # removes those columns
colnames(gts) <- c("chromosome", "position", "gene", "snp", gts[1, 5:ncol(gts)]) # replace column names with correct AIBL IDs
gts <- gts[-c(1:3), ] # remove extraneous first three rows
Write table to working folder
# write.table(gts, file = "~/Desktop/Uni Air/CSIRO/AD_project/Working/genotype_data.txt", sep = "\t")
In accordance with the guide here: http://zzz.bwh.harvard.edu/plink/data.shtml.
Create PED file
ped <- read.table("~/Desktop/Uni Air/CSIRO/AD_project/Working/genotype_data.txt", header = T)
# find unique values across genotype data
as.character(unique(unlist(ped[, 5:ncol(ped)])))
## [1] "C_C" "G_G"
## [3] "A_A" "T_T"
## [5] "A_G" "T_G"
## [7] "A_C" "T_C"
## [9] "C_T" "A_T"
## [11] "G_A" NA
## [13] "G_T" "C_G"
## [15] "G_C" "C_A"
## [17] "T_A" "Homozygous Allele 2/Allele 2"
## [19] "II" "DI"
## [21] "DD"
# shows that there are some with values "Homozygous Allele 2/Allele 2", II", "DI", and "DD". They must have caused the errors. Find out which rows they are in
df <- t(apply(ped, 1, function(x) grepl("II|DI|DD|Homozygous Allele 2/Allele 2", x[5:length(x)])))
df2 <- which(df == "TRUE", arr.ind = T)
unique(df2[, 1]) # to get the rows (genes) that contain the above strings
## [1] 1522 1720 1725
ped <- ped[-unique(df2[, 1]), ]# remove those rows
# noticed that the row names are not re-adjusted, so we'll do that manually
rownames(ped) <- seq(1:nrow(ped))
# PLINK revealed another error - AIBL.Id 74 has a third allele for rs2247856 - convert to NA
a <- grep("rs2247856", ped$snp)
b <- grep("X74", colnames(ped))
ped[a, b] <- NA
# After trying to run it again, PLINK revealed a further error - we need to find which rows (SNPs) have all NA and remove
na_gns <- apply(ped[, 5:ncol(ped)], 1, function(x) length(which(is.na(x) == T)) == length(5:ncol(ped)))
length(which(na_gns == T)) # shows that 40 SNPs have no data
## [1] 40
ped <- ped[-which(na_gns == T), ] # remove
Create map file
map <- data.frame(ped[, 1:4], stringsAsFactors = F)
map <- dplyr::select(map, chromosome, snp, position)
map$chromosome <- as.character(map$chromosome)
map$snp <- as.character(map$snp)
# check that there are no unexpected values
unique(map$chromosome)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
## [15] "15" "16" "17" "18" "19" "20" "21" "22" "X" NA
# noticed that there is an empty row - remove from map and ped file
map <- map[-which(is.na(map$chromosome)), ]
ped <- ped[-which(is.na(ped$chromosome)), ]
# replace chr "X" with "23" as required by format
map$chromosome[map$chromosome == "X"] <- "23"
unique(map$chromosome)
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14"
## [15] "15" "16" "17" "18" "19" "20" "21" "22" "23"
Find duplicated SNP entries in the PED and determine whether all the genotype data is also duplicated (in which case we can delete).
# how many of the SNPs are unique?
length(unique(map$snp))
## [1] 2045
# get duplicated rs IDs from MAP file
map$snp[which(duplicated(map$snp))]
# create new data frame with genotype data from PED file matching those rs IDs
dupe <- ped[ped$snp %in% map$snp[which(duplicated(map$snp))], ]
# By visualising them, we see that an individual's entries for the duplicated SNP can be different. JD suggested to remove them all.
map <- map[!map$snp %in% dupe$snp, ] # remove from map
ped <- ped[!ped$snp %in% dupe$snp, ] # remove from ped
Write map file
map <- data.frame(lapply(map, function(x){
gsub(" ", "", x)
}), stringsAsFactors = F)
# write.table(map, "~/Desktop/Uni Air/CSIRO/AD_project/Working/plink/snp_data.map", row.names = F, col.names = F, quote = F, sep = "\t")
Reformat ped into PED format
ped2 <- ped[, -c(1:4)] # remove extraneous columns
meta <- read.table("~/Desktop/Uni Air/CSIRO/AD_project/Working/key_metadata.txt", header = T) # read in metadata
colnames(ped2) <- gsub("X", "", colnames(ped2)) # remove "X" from colnames
ped2 <- data.frame(t(ped2), stringsAsFactors = F) # transpose ped2
ped2$AIBL.Id <- rownames(ped2) # make a new column with the row names (AIBL Ids) to use to bind with metadata
meta$AIBL.Id <- as.character(meta$AIBL.Id) # convert to characters to allow binding
library(dplyr)
ped2 <- left_join(ped2, meta, by = "AIBL.Id")
ped3 <- dplyr::select(ped2, AIBL.Id, Demographic.Sex, PET, everything())
ped3[, grep("Age|apoe4", colnames(ped3), ignore.case = T)] <- NULL # remove Age and apoe4 columns
# convert male/female to 1/2
ped3$Demographic.Sex <- as.character(ped3$Demographic.Sex)
ped3$Demographic.Sex[ped3$Demographic.Sex == "Male"] <- "1"
ped3$Demographic.Sex[ped3$Demographic.Sex == "Female"] <- "2"
# convert PET status from POS/NEG to 1/0
ped3$PET <- as.character(ped3$PET)
ped3$PET[ped3$PET == "POS"] <- "2"
ped3$PET[ped3$PET == "NEG"] <- "1"
ped3$PET[is.na(ped3$PET)] <- "0"
# clean data
ped4 <- data.frame(lapply(ped3, function(x){
gsub(" ", "", x)
gsub("_", " ", x)
}), stringsAsFactors = F)
ped4[is.na(ped4)] <- "0 0"
## write file
# write.table(ped4, "~/Desktop/Uni Air/CSIRO/AD_project/Working/plink/snp_data.ped", row.names = F, col.names = F, quote = F, sep = "\t")
Obtain reference allele from BioMart query, in accordance with: http://zzz.bwh.harvard.edu/plink/dataman.shtml#refallele
# read in table
snp_gns <- read.delim("~/Desktop/Uni Air/CSIRO/AD_project/Working/aibl_snp_genes.txt", header = T)
ref_al <- dplyr::select(snp_gns, refsnp_id, minor_allele)
ref_al <- as.tibble(sapply(ref_al, as.character), stringsAsFactors = F)
ref_al <- ref_al[!duplicated(ref_al), ]
# write.table(ref_al, "~/Desktop/Uni Air/CSIRO/AD_project/Working/plink/ref_alleles.txt", row.names = F, col.names = F, quote = F, sep = "\t")
I fed the files to PLINK using the following code: ./plink --file snp_data --map3 --noweb --no-fid --no-parents --reference-allele ref_alleles.txt --assoc and ./plink --file snp_data --map3 --noweb --no-fid --no-parents --reference-allele ref_alleles.txt --fisher, which created the output files ‘plink.assoc’ and ‘plink.assoc.fisher’.
Take a look at the files.
pk_ass <- read.table("~/Desktop/Uni Air/CSIRO/AD_project/Working/plink/plink.assoc", na.strings = NA, header = T, fill = T)
pk_fsh <- read.table("~/Desktop/Uni Air/CSIRO/AD_project/Working/plink/plink.assoc.fisher", na.strings = NA, header = T, fill = T)
length(which(pk_ass$P < 0.05))
## [1] 87
length(which(pk_fsh$P < 0.05))
## [1] 72
Shows that the Fisher exact test output had fewer significant SNPs, so we’ll prefer that output.
#create table of significant SNPs
sig_pk <- pk_fsh[(pk_fsh$P < 0.05) == T &!is.na(pk_fsh$P), ]
dim(sig_pk)
## [1] 72 9
# merge with gene ids from 'aible_snp_genes.txt' file
snp_gns <- read.delim("~/Desktop/Uni Air/CSIRO/AD_project/Working/aibl_snp_genes.txt", sep = "\t", header = T, stringsAsFactors = F)
library(dplyr)
sig_pk <- left_join(sig_pk, snp_gns, by = c("SNP" = "refsnp_id"))
# note that some SNPs map to multiple genes
# how many unique genes?
length(unique(sig_pk$hgnc_symbol))
## [1] 59
length(unique(sig_pk$ensembl_gene_stable_id))
## [1] 80
length(unique(sig_pk$entrezgene))
## [1] 55
# order by P value
sig_pk <- sig_pk[order(sig_pk$P), ]
# rename and reorder columns
sig_pk <- dplyr::select(sig_pk,
SNP,
minor_allele = "A1",
freq_in_cases = "F_A",
freq_in_controls = "F_U",
major_allele = "A2",
p_value = "P",
odds_ratio_minor_allele = "OR",
chr = "chr_name",
snp_locus = "chrom_start",
gene_start = "start_position",
gene_end = "end_position",
strand, ensembl_id = "ensembl_gene_stable_id",
hgnc_symbol, entrez_id = "entrezgene")
## write table
# write.table(sig_pk, "~/Desktop/Uni Air/CSIRO/AD_project/Working/sig_PET_pos_v_neg_alleles_annotated.txt", col.names = T, row.names = F, quote = F, sep = "\t")
Combine data with probe id (by entrez id, which is the most annotated of gene ids)
probes <- read.delim("~/Desktop/Uni Air/CSIRO/AD_project/Working/affy_probes_annotated.txt", header = T, sep = "\t")
sig_pk <- left_join(sig_pk, probes, by = "entrez_id")
# remove second ensembl id and gene symbol columns (they have no additional information)
sig_pk <- dplyr::select(sig_pk, -ensembl_id.y, -gene_symbol, ensembl_id = "ensembl_id.x")
# add NAs to hgnc_symbol column
sig_pk$hgnc_symbol[sig_pk$hgnc_symbol == ""] <- NA
sig_pk$hgnc_symbol <- as.character(sig_pk$hgnc_symbol)
# how many genes?
length(unique(sig_pk$hgnc_symbol))
## [1] 59
Write covariate file, in accordance with: http://zzz.bwh.harvard.edu/plink/data.shtml#covar
cov <- ped2[,(ncol(ped2) - 5 + 1):ncol(ped2)] # to obtain metadata bound to aibl IDs for which we have genotype data
cov$Demographic.Sex <- as.character(cov$Demographic.Sex)
cov$Demographic.Sex[cov$Demographic.Sex == "Male"] <- "1"
cov$Demographic.Sex[cov$Demographic.Sex == "Female"] <- "2"
colnames(cov)[colnames(cov) == "Demographic.Sex"] <- "Sex"
cov$PET <- NULL # because it's already in the PED file
cov$FID <- cov$AIBL.Id # to make a family ID column - see <http://zzz.bwh.harvard.edu/plink/data.shtml#ped>
cov <- dplyr::select(cov, FID, IID = "AIBL.Id", everything())
# write.table(cov, "~/Desktop/Uni Air/CSIRO/AD_project/Working/plink/cov.txt", row.names = F, col.names = T, quote = F, sep = "\t")
Ran files on PLINK with the following command: ./plink --file snp_data --map3 --no-fid --no-parents --logistic --beta --reference-allele ref_alleles.txt --noweb --adjust --covar cov.txt
Read output
log <- read.delim("~/Desktop/Uni Air/CSIRO/AD_project/Working/plink/plink.assoc.logistic", header = T, sep = "")
log <- log[with(log, order(CHR, BP)), ] # sort by chromosome and bp
# Find SNPs that are significant indpendent of covariates
log2 <- dplyr::select(log[!is.na(log$P), ], -NMISS, -BETA, -STAT) %>%
tidyr::spread(key = TEST, value = P)
length(which(log2$ADD < 0.05)) # tells us how many SNPs were significant
## [1] 87
length(which(log2$Age < 0.05)) # tells us for how many SNPs Age was a significant covariate
## [1] 1826
length(which(log2$apoe4 < 0.05)) # tells us for how many SNPs the presence of the apoe4 allele was a significant covariate
## [1] 1826
length(which(log2$Sex < 0.05)) # tells us for how many SNPs sex was a significant covariate
## [1] 0
This means there apoe4 and age were significant covariates in all but two SNPs.
Let’s pull out the SNPs with ADD < 0.05 and annotate them.
# pull out SNPs that are significant
sig_snps_log <- log[log$SNP %in% log2$SNP[which(log2$ADD < 0.05)] & log$TEST == "ADD", ]
# attach annotation data
sig_snps_log <- dplyr::left_join(sig_snps_log, snp_gns, by = c("SNP" = "refsnp_id"))
# how many significant SNPs have been annotated with gene symbols?
sig_snps_log$hgnc_symbol <- as.character(sig_snps_log$hgnc_symbol) # convert to character
sig_snps_log$hgnc_symbol[sig_snps_log$hgnc_symbol == ""] <- NA # replace blank values with NA
sig_snp_symb <- sig_snps_log[is.na(sig_snps_log$hgnc_symbol) == F, ] # create new data frame with SNPs that have been annotated with gene symbols
# get unique values
length(unique(sig_snp_symb$SNP))
## [1] 83
length(unique(sig_snp_symb$hgnc_symbol))
## [1] 59
See which SNPs are significant below the FDR corrected p-value
# read adjust file
log_adj <- read.delim("~/Desktop/Uni Air/CSIRO/AD_project/Working/plink/plink.assoc.logistic.adjusted", header = T, sep = "")
nrow(log_adj[log_adj$FDR_BH < 0.05, ])
## [1] 0
Therefore there are no SNPs below the FDR corrected p-value.
Find which SNPs and genes are signficant in both the association and logistic regression analyses
length(intersect(sig_snps_log$SNP, sig_pk$SNP))
intersect(sig_snps_log$hgnc_symbol, sig_pk$hgnc_symbol) # genes
## [1] "MTOR" NA "PPP3R1" "BIN1" "EPC2"
## [6] "FYCO1" "CXCR6" "TF" "TFP1" "HPS3"
## [11] "CP" "SKIV2L2" "PLPP1" "AK4P2" "MTHFD1L"
## [16] "ASAH1" "GULOP" "CLU" "CDKN2B-AS1" "NTRK2"
## [21] "NPY4R2" "SGMS1" "CALHM1" "SORCS1" "LDLRAD3"
## [26] "GRIN2B" "MTHFD1" "AKAP5" "ZBTB25" "KLC1"
## [31] "SLC30A4" "HMGN2P46" "MIR4713HG" "CYP19A1" "VPS35"
## [36] "SMPD3" "MAPT" "ABCA7" "ARHGAP45" "FAM83E"
## [41] "RPL18" "SPHK2" "CA11" "SEC1P" "DBP"
## [46] "PLTP" "DNMT3L"
See whether any genes cited in Apostolova et al. (2018) Associations of the top 20 Alzheimer Disease risk variants with brain amyloidosis are in our significant SNPs list.
# first check how many hits we get with these genes from the original SNPs list
nrow(snp_gns[grep("PICALM|BIN1|CR1|ABCA7|MS4A6A|CLU|FERMT2|DSG2|EPHA1|SORL1|ZCWPW1", snp_gns$hgnc_symbol), ])
## [1] 148
# now check how many hits we get with our significant SNPs list
sig_snps_log[grep("PICALM|BIN1|CR1|ABCA7|MS4A6A|CLU|FERMT2|DSG2|EPHA1|SORL1|ZCWPW1", sig_snps_log$hgnc_symbol), ]
## CHR SNP BP A1 TEST NMISS BETA STAT P
## 14 2 rs6709337 127844735 C ADD 160 0.5654 1.968 0.049020
## 50 8 rs17057438 27450073 A ADD 161 -0.7984 -2.796 0.005172
## 52 8 rs10101779 27450535 T ADD 161 0.7990 2.800 0.005116
## 53 8 rs9331930 27458294 G ADD 161 -0.8782 -3.006 0.002643
## 54 8 rs7812347 27460185 A ADD 161 -0.9089 -3.063 0.002189
## 89 11 rs680119 85716032 C ADD 161 -0.5757 -2.051 0.040240
## 130 19 rs4147937 1065677 A ADD 162 -1.2890 -2.131 0.033050
## allele_1 minor_allele minor_allele_freq chr_name chrom_start chrom_end
## 14 C A 0.416733 2 127087159 127087159
## 50 A A 0.416534 8 27592556 27592556
## 52 C NA 8 27593018 27593018
## 53 G G 0.263578 8 27600777 27600777
## 54 A A 0.263978 8 27602668 27602668
## 89 T T 0.332268 11 86004989 86004989
## 130 G A 0.091254 19 1065678 1065678
## ensembl_gene_stable_id chromosome_name start_position end_position
## 14 ENSG00000136717 2 127048027 127107355
## 50 ENSG00000120885 8 27596917 27615031
## 52 ENSG00000120885 8 27596917 27615031
## 53 ENSG00000120885 8 27596917 27615031
## 54 ENSG00000120885 8 27596917 27615031
## 89 ENSG00000073921 11 85957684 86069882
## 130 ENSG00000064687 19 1040101 1065572
## strand hgnc_symbol entrezgene
## 14 -1 BIN1 274
## 50 -1 CLU 1191
## 52 -1 CLU 1191
## 53 -1 CLU 1191
## 54 -1 CLU 1191
## 89 -1 PICALM 8301
## 130 1 ABCA7 10347
## write file
# write.table(sig_snps_log, "~/Desktop/Uni Air/CSIRO/AD_project/Working/sig_PET_pos_v_neg_snps.txt", row.names = F, col.names = T, quote = F, sep = "\t")
Four genes: BIN1, CLU, PICALM, and ABCA7.
Ran PLINK again for association, but this time with 10,000 permutations and the reference allele file: ./plink --file snp_data --map3 --noweb --no-fid --no-parents --assoc --mperm 10000 --reference-allele ref_alleles.txt. Output two files:
Take a look again, and compare to the logistic regression output
ass <- read.table("~/Desktop/Uni Air/CSIRO/AD_project/Working/plink/plink.assoc", na.strings = NA, header = T, fill = T)
sig_ass <- ass[ass$P < 0.05 & is.na(ass$P) == F, ]
perm <- read.table("~/Desktop/Uni Air/CSIRO/AD_project/Working/plink/plink.assoc.mperm", na.strings = NA, header = T, fill = T)
perm[which(perm$EMP2 < 0.7), ] # see how many SNPs had an FDR-adjusted p-value of less than 0.7
## CHR SNP EMP1 EMP2
## 237 2 rs16829119 0.0005999 0.6764
nrow(perm[which(perm$EMP2 < 0.05), ])
## [1] 0
Only one SNP has a corrected p-value of less than 0.7! Therefore not persuasive evidence of an effect.
Read in data.
exdata <- read.delim("~/Dropbox/eQTL/Data/AIBL_expression_set/AIBL_Gene_Expression.txt", sep = "", header = T)
ids2 <- read.delim("~/Dropbox/eQTL/Data/AIBL_expression_set/AIBL_Gene_Expression_IDs_UpdtdDec2017.txt", sep = " ")
colnames(exdata) <- as.vector(ids2$x) # replace colnames with ids2
meta <- read.delim("~/Desktop/Uni Air/CSIRO/AD_project/Working/key_metadata.txt", header = T, sep = "\t")
MatrixEQTL package.# create binary statuses for sex and PET
meta$Demographic.Sex <- ifelse(meta$Demographic.Sex == "Male", 0,
ifelse(meta$Demographic.Sex == "Female", 1, NA))
meta$PET <- ifelse(meta$PET == "POS", 1,
ifelse(meta$PET == "NEG", 0, NA))
# tidy metadata
meta <- data.frame(t(meta), stringsAsFactors = F)
colnames(meta) <- meta[1, ]
meta$id <- rownames(meta)
rownames(meta) <- seq(1:nrow(meta))
meta <- dplyr::select(meta, id, dplyr::everything())
meta <- meta[-1, ]
# tidy probe data
exdata$probe_id <- rownames(exdata)
exdata <- dplyr::select(exdata, probe_id, dplyr::everything())
rownames(exdata) <- seq(1:nrow(exdata))
# tidy genotype data
ped6 <- ped[, -c(1:3)]
ped6 <- left_join(ped6, dplyr::select(snp_gns[!duplicated(snp_gns$refsnp_id), ], refsnp_id, allele_1, minor_allele), by = c("snp" = "refsnp_id")) %>%
dplyr::select(snp, allele_1, minor_allele, everything())
ped6 <- data.frame(lapply(ped6, function(x) gsub("_", " ", x)),
stringsAsFactors = F)
Noticed that for some SNPs the alleles annotated with BioMart differ from the alleles present in our data (e.g. “rs1064261” is annotated as having A and G alleles, but our data shows T and C), and some SNPs are annotated as having the same major and minor alleles (e.g. “rs3753584” which has C for both its major and minor allele).
We will therefore get another plink output to find calculate which alleles are major and minor when they differ from the BioMart annotations, or when the BioMart annotations seem incorrect. We used the code: ./plink --file snp_data --map3 --noweb --no-fid --no-parents --fisher --out allele_freq
# import plink data on allele frequencies
pk_frq <- read.table("~/Desktop/Uni Air/CSIRO/AD_project/Working/plink/allele_freq.assoc.fisher", na.strings = NA, header = T, fill = T)
pk_frq <- dplyr::select(pk_frq, snp = SNP, pk_major_allele = A2, pk_minor_allele = A1)
# create comparison data frame
comp <- data.frame(left_join(dplyr::select(ped6, snp, allele_1, minor_allele), pk_frq, by = "snp") %>%
dplyr::select(snp, allele_1, pk_major_allele, minor_allele, pk_minor_allele), stringsAsFactors = F)
comp <- data.frame(sapply(comp, as.character), stringsAsFactors = F)
# create empty columns for correct allele calls
comp$cor_mj <- NA
comp$cor_mn <- NA
# substitute empty values with NA
comp[, 2:5] <- sapply(comp[, 2:5], function(x) gsub("0", NA, x))
comp[, 2:5] <- sapply(comp[, 2:5], function(x) gsub("^$|^ $", NA, x))
# add correct major allele column
comp$cor_mj <- unlist(apply(comp, 1, function(x){
if (is.na(x[2]) == T & is.na(x[3]) == T){ # if both are NA, return NA
x[6] <- NA
} else if (x[2] == x[5] & x[4] == x[3] & is.na(x[2] == F) & is.na(x[3] == F)){ # if annotated and plink alleles are swapped around, prefer annotated alleles
x[6] <- x[2]
} else if (is.na(x[2]) == T & is.na(x[3] == F)){ # if only plink output has an allele, use that
x[6] <- x[3]
} else if (is.na(x[2]) == F & is.na(x[3] == T)){ # if only the annotation has an allele, use that
x[6] <- x[2]
} else if (x[2] == x[3] & is.na(x[2] == F) & is.na(x[3] == F)) { # if the annotated and plink alleles are the same, just use the annotated allele
x[6] <- x[2]
} else { # otherwise use the plink allele
x[6] <- x[3]
}
}))
# add correct minor allele column
comp$cor_mn <- unlist(apply(comp, 1, function(x){
if (is.na(x[4]) == T & is.na(x[5]) == T){
x[7] <- NA
} else if (x[2] == x[5] & x[4] == x[3] & is.na(x[2] == F) & is.na(x[3] == F)){
x[7] <- x[4]
} else if (is.na(x[4]) == T & is.na(x[5] == F)){
x[7] <- x[5]
} else if (is.na(x[4]) == F & is.na(x[5] == T)){
x[7] <- x[4]
} else if (x[4] == x[5] & is.na(x[4]) == F & is.na(x[5]) == F) {
x[7] <- x[4]
} else {
x[7] <- x[5]
}
}))
Incorporate adjusted major and minor allele into ped6 and create genotype where: * 0 = homozygous major allele * 1 = heterozygous * 2 = homozygous minor allele
ped6$allele_1 <- comp$cor_mj
ped6$minor_allele <- comp$cor_mn
out <- t(data.frame(apply(ped6, 1, function(x){ ## WORKS - need to change the second function from sapply to lapply
a <- paste(x[2], x[2], sep = " ")
b <- paste(x[2], x[3], sep = " ")
c <- paste(x[3], x[2], sep = " ")
d <- paste(x[3], x[3], sep = " ")
results <- lapply(x[4:length(x)], function(y){
if (is.na(y) == T){
y <- NA
}
else if (y == a & is.na(y) == F){
y <- 0
}
else if (y == b | y == c & is.na(y) == F){
y <- 1
}
else if (y == d & is.na(y) == F){
y <- 2
}
})
data.frame(do.call(rbind, results))
})))
# bind snp info to genotype coding
ped7 <- data.frame(cbind(ped6[, 1], out), stringsAsFactors = F)
rownames(ped7) <- seq(1:nrow(ped7))
ped7 <- dplyr::rename(ped7, snpid = V1)
colnames(ped7) <- gsub("X", "", colnames(ped7))
exdata <- read.delim("~/Dropbox/eQTL/Data/AIBL_expression_set/AIBL_Gene_Expression.txt", sep = "", header = T)
ids2 <- read.delim("~/Dropbox/eQTL/Data/AIBL_expression_set/AIBL_Gene_Expression_IDs_UpdtdDec2017.txt", sep = " ")
colnames(exdata) <- as.vector(ids2$x) # replace colnames with ids2
# extract only those columns (samples) for which we have genotype data
exdata <- exdata[, colnames(exdata) %in% colnames(ped7)]
# create column for probe ID and reorder
exdata$probe_id <- rownames(exdata)
exdata <- dplyr::select(exdata, probe_id, everything())
Finalise covariates file by extracting only those columns (samples) for which we have genotype data
cns <- c("id", colnames(meta)[colnames(meta) %in% colnames(ped7)])
meta <- meta[, cns]
Write files
# write.table(ped7, "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/snps.txt", col.names = T, row.names = F, quote = F, sep = "\t")
# write.table(meta, "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/covs.txt", col.names = T, row.names = F, quote = F, sep = "\t")
# write.table(exdata, "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/expr.txt", col.names = T, row.names = F, quote = F, sep = "\t")
library(MatrixEQTL)
useModel = modelLINEAR
SNP_file_name = "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/snps.txt"
expression_file_name = "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/expr.txt"
covariates_file_name = "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/covs.txt"
output_file_name = tempfile()
pvOutputThreshold = 1e-2
errorCovariance = numeric()
Load snps data
snps = SlicedData$new()
snps$fileDelimiter = "\t"
snps$fileOmitCharacters = "NA"
snps$fileSkipRows = 1
snps$fileSkipColumns = 1
snps$fileSliceSize = 2000
snps$LoadFile( SNP_file_name )
Load gene data
gene = SlicedData$new()
gene$fileDelimiter = "\t"
gene$fileOmitCharacters = "NA"
gene$fileSkipRows = 1
gene$fileSkipColumns = 1
gene$fileSliceSize = 2000
gene$LoadFile( expression_file_name )
Load covariate data
cvrt = SlicedData$new()
cvrt$fileDelimiter = "\t"
cvrt$fileOmitCharacters = "NA"
cvrt$fileSkipRows = 1
cvrt$fileSkipColumns = 1
cvrt$fileSliceSize = 2000
cvrt$LoadFile( covariates_file_name )
me = Matrix_eQTL_engine(
snps = snps,
gene = gene,
cvrt = cvrt,
output_file_name = output_file_name,
pvOutputThreshold = pvOutputThreshold,
useModel = useModel,
errorCovariance = errorCovariance,
verbose = TRUE,
pvalue.hist = TRUE,
min.pv.by.genesnp = FALSE,
noFDRsaveMemory = FALSE)
# create table of annotated significant genes (from original limma)
sig_PET_pos_v_neg <- left_join(signif_PET_pos_v_neg, pid_all, by = "probe_id")
# read in annotated probe table
pid_all <- read.delim("~/Desktop/Uni Air/CSIRO/AD_project/Working/affy_probes_annotated.txt", header = T, sep = "\t", stringsAsFactors = F)
# pull out eQTLs from output
eqtls <- me[["all"]][["eqtls"]]
# pull out significant SNP-gene relationships (FDR < 0.05)
sig_eqtls <- eqtls[eqtls$FDR < 0.05, ]
dim(sig_eqtls)
## [1] 3055 6
# number of unique SNPs
length(unique(sig_eqtls$snps))
## [1] 130
# number of unique probes
length(unique(sig_eqtls$gene))
## [1] 2976
# annotate
sig_eqtls$snps <- as.character(sig_eqtls$snps)
sig_eqtls$gene <- as.character(sig_eqtls$gene)
pid_all$probe_id <- as.character(pid_all$probe_id)
sig_eqtl_snps <- dplyr::left_join(sig_eqtls, snp_gns, by = c("snps" = "refsnp_id"))
sig_eqtl_gns <- dplyr::left_join(sig_eqtls, pid_all, by = c("gene" = "probe_id"))
## write files
# write.table(sig_eqtl_snps, "~/Desktop/Uni Air/CSIRO/AD_project/Working/eqtl_snps.txt", row.names = F, col.names = T, quote = F, sep = "\t")
# write.table(sig_eqtl_gns, "~/Desktop/Uni Air/CSIRO/AD_project/Working/eqtl_gns.txt", row.names = F, col.names = T, quote = F, sep = "\t")
# find any hits with the genes implicated in the 2018 study
nrow(sig_eqtl_snps[grep("PICALM|BIN1|CR1|ABCA7|MS4A6A|CLU|FERMT2|DSG2|EPHA1|SORL1|ZCWPW1", sig_eqtl_snps$hgnc_symbol), ])
## [1] 20
nrow(sig_eqtl_gns[grep("PICALM|BIN1|CR1|ABCA7|MS4A6A|CLU|FERMT2|DSG2|EPHA1|SORL1|ZCWPW1", sig_eqtl_gns$hgnc_symbol), ])
## [1] 0
# see whether any of the SNP genes intersect with the expression genes (i.e. are any of them acting cis?)
eqtl_snp_gns <- as.vector(na.omit(unique(sig_eqtl_snps$hgnc_symbol)))
length(eqtl_snp_gns)
## [1] 101
eqtl_de_gns <- as.vector(na.omit(unique(sig_eqtl_gns$gene_symbol)))
length(eqtl_de_gns)
## [1] 2246
intersect(eqtl_snp_gns, eqtl_de_gns)
## [1] "HSD11B2" "CYP21A2" "ACE" "TXNL4B" "ICAM5" "IDE" "SLC30A3"
## [8] "SGMS1"
130 significant AIBL SNPs in 101 annotated genes, affecting the expression levels of 2242 genes. Only eight of them are shared between the SNP genes and the differentially expressed genes, meaning that only those appear to be acting cis.
# create covariates file for pos status
PET_pos_covs <- read.delim("~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/covs.txt", sep = "\t", header = T)
PET_pos_covs <- data.frame(t(PET_pos_covs), stringsAsFactors = F)
colnames(PET_pos_covs) <- PET_pos_covs[1, ]
PET_pos_covs$aibl_id <- gsub("X", "", rownames(PET_pos_covs))
PET_pos_covs$PET <- as.integer((PET_pos_covs$PET))
PET_pos_covs <- dplyr::filter(PET_pos_covs, PET == 1)
PET_pos_covs <- data.frame(t(PET_pos_covs), stringsAsFactors = F)
colnames(PET_pos_covs) <- PET_pos_covs[nrow(PET_pos_covs), ]
PET_pos_covs <- PET_pos_covs[-c(2, nrow(PET_pos_covs)), ] # remove PET and id rows
PET_pos_covs$id <- rownames(PET_pos_covs)
PET_pos_covs <- dplyr::select(PET_pos_covs, id, everything())
rownames(PET_pos_covs) <- NULL
# write.table(PET_pos_covs, "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_pos_covs.txt", col.names = T, row.names = F, quote = F, sep = "\t")
# create covariates file for neg status
PET_neg_covs <- read.delim("~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/covs.txt", sep = "\t", header = T)
PET_neg_covs <- data.frame(t(PET_neg_covs), stringsAsFactors = F)
colnames(PET_neg_covs) <- PET_neg_covs[1, ]
PET_neg_covs$aibl_id <- gsub("X", "", rownames(PET_neg_covs))
PET_neg_covs$PET <- as.integer((PET_neg_covs$PET))
PET_neg_covs <- dplyr::filter(PET_neg_covs, PET == 0)
PET_neg_covs <- data.frame(t(PET_neg_covs), stringsAsFactors = F)
colnames(PET_neg_covs) <- PET_neg_covs[nrow(PET_neg_covs), ]
PET_neg_covs <- PET_neg_covs[-c(2, nrow(PET_neg_covs)), ]
PET_neg_covs$id <- rownames(PET_neg_covs)
PET_neg_covs <- dplyr::select(PET_neg_covs, id, everything())
rownames(PET_neg_covs) <- NULL
# write.table(PET_neg_covs, "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_neg_covs.txt", col.names = T, row.names = F, quote = F, sep = "\t")
## Get expression data
exdata <- read.delim("~/Dropbox/eQTL/Data/AIBL_expression_set/AIBL_Gene_Expression.txt", sep = "", header = T)
ids2 <- read.delim("~/Dropbox/eQTL/Data/AIBL_expression_set/AIBL_Gene_Expression_IDs_UpdtdDec2017.txt", sep = " ")
colnames(exdata) <- as.vector(ids2$x) # replace colnames with ids2
exdata <- exdata[, colnames(exdata) %in% colnames(ped7)] # extract only those columns (samples) for which we have genotype data
# for POS group
pos_exdata <- exdata[, colnames(exdata) %in% colnames(PET_pos_covs)]
pos_exdata$probe_id <- rownames(pos_exdata) # create column for probe ID and reorder
pos_exdata <- dplyr::select(pos_exdata, probe_id, everything())
# write.table(pos_exdata, "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_pos_expr.txt", col.names = T, row.names = F, quote = F, sep = "\t")
# for NEG group
neg_exdata <- exdata[, colnames(exdata) %in% colnames(PET_neg_covs)]
neg_exdata$probe_id <- rownames(neg_exdata) # create column for probe ID and reorder
neg_exdata <- dplyr::select(neg_exdata, probe_id, everything())
# write.table(neg_exdata, "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_neg_expr.txt", col.names = T, row.names = F, quote = F, sep = "\t")
# read in data
gtps <- read.delim("~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/snps.txt", header = T, sep = "\t")
colnames(gtps) <- gsub("X", "", colnames(gtps))
# for POS group
pet_pos_gtps <- gtps[, colnames(gtps) %in% colnames(PET_pos_covs)]
pet_pos_gtps <- cbind(snpid = gtps[, 1], pet_pos_gtps)
# write.table(pet_pos_gtps, "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_pos_gtps.txt", col.names = T, row.names = F, quote = F, sep = "\t")
# for NEG group
pet_neg_gtps <- gtps[, colnames(gtps) %in% colnames(PET_neg_covs)]
pet_neg_gtps <- cbind(snpid = gtps[, 1], pet_neg_gtps)
# write.table(pet_neg_gtps, "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_neg_gtps.txt", col.names = T, row.names = F, quote = F, sep = "\t")
Set up parameters for eQTL
library(MatrixEQTL)
useModel = modelLINEAR
SNP_file_name = "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_pos_gtps.txt"
expression_file_name = "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_pos_expr.txt"
covariates_file_name = "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_pos_covs.txt"
output_file_name = tempfile()
pvOutputThreshold = 1e-2
errorCovariance = numeric()
Load snps data
snps = SlicedData$new()
snps$fileDelimiter = "\t"
snps$fileOmitCharacters = "NA"
snps$fileSkipRows = 1
snps$fileSkipColumns = 1
snps$fileSliceSize = 2000
snps$LoadFile( SNP_file_name )
Load gene data
gene = SlicedData$new()
gene$fileDelimiter = "\t"
gene$fileOmitCharacters = "NA"
gene$fileSkipRows = 1
gene$fileSkipColumns = 1
gene$fileSliceSize = 2000
gene$LoadFile( expression_file_name )
Load covariate data
cvrt = SlicedData$new()
cvrt$fileDelimiter = "\t"
cvrt$fileOmitCharacters = "NA"
cvrt$fileSkipRows = 1
cvrt$fileSkipColumns = 1
cvrt$fileSliceSize = 2000
cvrt$LoadFile( covariates_file_name )
Run Matrix eQTL on PET-pos group
me_pos = Matrix_eQTL_engine(
snps = snps,
gene = gene,
cvrt = cvrt,
output_file_name = output_file_name,
pvOutputThreshold = pvOutputThreshold,
useModel = useModel,
errorCovariance = errorCovariance,
verbose = TRUE,
pvalue.hist = TRUE,
min.pv.by.genesnp = FALSE,
noFDRsaveMemory = FALSE)
Set up parameters for eQTL
library(MatrixEQTL)
useModel = modelLINEAR
SNP_file_name = "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_neg_gtps.txt"
expression_file_name = "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_neg_expr.txt"
covariates_file_name = "~/Desktop/Uni Air/CSIRO/AD_project/Working/matrixeqtl/pet_neg_covs.txt"
output_file_name = tempfile()
pvOutputThreshold = 1e-2
errorCovariance = numeric()
Load snps data
snps = SlicedData$new()
snps$fileDelimiter = "\t"
snps$fileOmitCharacters = "NA"
snps$fileSkipRows = 1
snps$fileSkipColumns = 1
snps$fileSliceSize = 2000
snps$LoadFile( SNP_file_name )
Load gene data
gene = SlicedData$new()
gene$fileDelimiter = "\t"
gene$fileOmitCharacters = "NA"
gene$fileSkipRows = 1
gene$fileSkipColumns = 1
gene$fileSliceSize = 2000
gene$LoadFile( expression_file_name )
Load covariate data
cvrt = SlicedData$new()
cvrt$fileDelimiter = "\t"
cvrt$fileOmitCharacters = "NA"
cvrt$fileSkipRows = 1
cvrt$fileSkipColumns = 1
cvrt$fileSliceSize = 2000
cvrt$LoadFile( covariates_file_name )
Run Matrix eQTL on PET-neg group
me_neg = Matrix_eQTL_engine(
snps = snps,
gene = gene,
cvrt = cvrt,
output_file_name = output_file_name,
pvOutputThreshold = pvOutputThreshold,
useModel = useModel,
errorCovariance = errorCovariance,
verbose = TRUE,
pvalue.hist = TRUE,
min.pv.by.genesnp = FALSE,
noFDRsaveMemory = FALSE)
# pull out eQTLs from output
pos_eqtls <- me_pos[["all"]][["eqtls"]]
# pull out significant SNP-gene relationships (FDR < 0.05)
pos_sig_eqtls <- pos_eqtls[pos_eqtls$FDR < 0.05, ]
dim(pos_sig_eqtls)
## [1] 121 6
# annotate
pos_sig_eqtl_snps <- left_join(pos_sig_eqtls, snp_gns, by = c("snps" = "refsnp_id"))
pos_sig_eqtl_gns <- left_join(pos_sig_eqtls, pid_all, by = c("gene" = "probe_id"))
## write files
# write.table(pos_sig_eqtl_snps, "~/Desktop/Uni Air/CSIRO/AD_project/Working/pos_eqtl_snps.txt", row.names = F, col.names = T, quote = F, sep = "\t")
# write.table(pos_sig_eqtl_gns, "~/Desktop/Uni Air/CSIRO/AD_project/Working/pos_eqtl_gns.txt", row.names = F, col.names = T, quote = F, sep = "\t")
# number of unique SNPs
length(unique(pos_sig_eqtl_snps$snps))
## [1] 28
# number of unique genes the SNPs sit within (note that it includes one NA)
length(unique(as.character(pos_sig_eqtl_snps$hgnc_symbol)))
## [1] 32
# number of unique differentially expressed genes (note that it includes one NA)
length(unique(pos_sig_eqtl_gns$gene_symbol))
## [1] 49
# create vectors of gene names for snps and differentially expressed genes
pos_eqtl_snp_gns <- as.vector(na.omit(unique(pos_sig_eqtl_snps$hgnc_symbol)))
pos_eqtl_snp_gns
## [1] "MIR6796" "PLD3" "FYCO1" "MIR6165" "NGFR" "NEDD9"
## [7] "CLU" "TRDMT1" "SGMS1" "HP" "HPR" "TXNL4B"
## [13] "NPY4R2" "APOC1" "OTC" "XRCC3" "KLC1" "ATAT1"
## [19] "C6orf136" "LRP8" "DNMT3B" "DNMT1" "CYP11B2" "GML"
## [25] "MEI1" "AKR1C3" "CD33" "PPARA" "TFRC" "NPC1"
## [31] "C18orf8"
pos_eqtl_de_gns <- as.vector(na.omit(unique(pos_sig_eqtl_gns$gene_symbol)))
pos_eqtl_de_gns
## [1] "ADGRG7" "RBM46" "DLGAP5" "TTK" "HMMR" "FST" "KIF15"
## [8] "BUB1" "ASPM" "PBK" "MCM10" "KIF14" "POLQ" "MND1"
## [15] "CDC6" "CEP55" "CLSPN" "DTL" "CCNB2" "CCNA2" "PCLAF"
## [22] "CCNB1" "KIF11" "CENPF" "MCM4" "MKI67" "EXO1" "GLYATL2"
## [29] "NUF2" "ANLN" "KIF23" "CA1" "STIL" "CDCA2" "APOD"
## [36] "PLK1" "TRIP13" "GTF3C3" "GFM2" "BIRC5" "CDC45" "NCAPH"
## [43] "AHSP" "NCAPG" "RRM2" "E2F8" "OR1E2" "ART1"
# find which ones are shared
intersect(pos_eqtl_snp_gns, pos_eqtl_de_gns)
## character(0)
28 SNPs in 31 unique genes (32 less NA) affecting 48 unique genes (49 less NA). None of those genes are shared.
# pull out eQTLs from output
neg_eqtls <- me_neg[["all"]][["eqtls"]]
# pull out significant SNP-gene relationships (FDR < 0.05)
neg_sig_eqtls <- neg_eqtls[neg_eqtls$FDR < 0.05, ]
dim(neg_sig_eqtls)
## [1] 160 6
head(neg_sig_eqtls)
## snps gene statistic pvalue FDR beta
## 1 rs5030400 2633737 30.58395 9.851831e-44 4.302277e-36 3.793769
## 2 rs1937 3343900 17.69020 2.686827e-28 5.866663e-21 1.995096
## 3 rs1937 3514711 16.75808 6.696144e-27 9.747314e-20 2.041898
## 4 rs1937 3475130 14.92243 5.167194e-24 5.641261e-17 1.259226
## 5 rs1937 2557585 14.17949 8.604028e-23 7.514728e-16 1.884516
## 6 rs1937 3712197 11.93355 6.458826e-19 4.700930e-12 1.313909
# number of unique SNPs
length(unique(neg_sig_eqtls$snps))
## [1] 35
# annotate
neg_sig_eqtl_snps <- left_join(neg_sig_eqtls, snp_gns, by = c("snps" = "refsnp_id"))
neg_sig_eqtl_gns <- left_join(neg_sig_eqtls, pid_all, by = c("gene" = "probe_id"))
## write files
# write.table(neg_sig_eqtl_snps, "~/Desktop/Uni Air/CSIRO/AD_project/Working/neg_eqtl_snps.txt", row.names = F, col.names = T, quote = F, sep = "\t")
# write.table(neg_sig_eqtl_gns, "~/Desktop/Uni Air/CSIRO/AD_project/Working/neg_eqtl_gns.txt", row.names = F, col.names = T, quote = F, sep = "\t")
# create vectors of gene names for snps and differentially expressed genes
neg_eqtl_snp_gns <- as.vector(na.omit(unique(neg_sig_eqtl_snps$hgnc_symbol)))
length(neg_eqtl_snp_gns)
## [1] 34
neg_eqtl_snp_gns
## [1] "ICAM5" "ICAM1" "ICAM4" "TFAM" "PPP3CB"
## [6] "EPHA1" "MME" "PPARA" "NPC1" "MRPS18B"
## [11] "PTMAP1" "ATAT1" "MTHFD1L" "DNAJC5G" "SLC30A3"
## [16] "BIN1" "MICAL1" "PYY" "BEST1" "FTH1"
## [21] "CR1" "CDKN2B-AS1" "CLU" "IL6" "CYP11B2"
## [26] "GML" "HSD17B3-AS1" "HSD17B3" "PSAP" "EPC2"
## [31] "ACO1" "UCP3" "MTHFR" "C1orf167"
neg_eqtl_de_gns <- as.vector(na.omit(unique(neg_sig_eqtl_gns$gene_symbol)))
length(neg_eqtl_de_gns)
## [1] 74
neg_eqtl_de_gns
## [1] "ADGRG7" "CTAGE3P" "EFHC1" "UTS2" "L1TD1"
## [6] "TBRG1" "ATP5L" "CHRDL2" "FAM20A" "SLTM"
## [11] "TUG1" "SNRPA1" "ERP27" "IL31RA" "HCLS1"
## [16] "RBM25" "PRRG1" "PPP6R3" "RUNDC3B" "ENTPD4"
## [21] "ZFC3H1" "DPP10" "CDK12" "MED4" "MYSM1"
## [26] "HOXA1" "R3HDM2" "SECISBP2" "EZH1" "HINT1"
## [31] "NEK1" "OR1C1" "ZMYM2" "RB1CC1" "RSRC2"
## [36] "KIAA1468" "GSN" "ZBTB1" "SLC26A5" "TBC1D15"
## [41] "SRSF5" "RPS15A" "CNTRL" "DUSP19" "MDM4"
## [46] "LUC7L3" "PDE3B" "FN1" "NAPG" "IFNAR2"
## [51] "ILF3-AS1" "POLI" "PDCD4" "MGEA5" "ZFP1"
## [56] "NSD3" "PNISR" "TRA2A" "SERPINI2" "STN1"
## [61] "TIGD7" "PAN3" "ESM1" "GGNBP2" "XPOT"
## [66] "NEMF" "AADACL2" "HIST1H2BE" "LYAR" "ACAT2"
## [71] "NAP1L2" "NKTR" "RFC1" "TTC17"
# find which ones are shared
intersect(neg_eqtl_snp_gns, neg_eqtl_de_gns)
## character(0)
35 SNPs in 34 unique genes affecting the expression of 74 unique genes. None of those genes are shared (i.e. all the SNPs are acting cis.
setdiff(pos_eqtl_de_gns, neg_eqtl_de_gns)
## [1] "RBM46" "DLGAP5" "TTK" "HMMR" "FST" "KIF15" "BUB1"
## [8] "ASPM" "PBK" "MCM10" "KIF14" "POLQ" "MND1" "CDC6"
## [15] "CEP55" "CLSPN" "DTL" "CCNB2" "CCNA2" "PCLAF" "CCNB1"
## [22] "KIF11" "CENPF" "MCM4" "MKI67" "EXO1" "GLYATL2" "NUF2"
## [29] "ANLN" "KIF23" "CA1" "STIL" "CDCA2" "APOD" "PLK1"
## [36] "TRIP13" "GTF3C3" "GFM2" "BIRC5" "CDC45" "NCAPH" "AHSP"
## [43] "NCAPG" "RRM2" "E2F8" "OR1E2" "ART1"
length(setdiff(pos_eqtl_de_gns, neg_eqtl_de_gns))
## [1] 47
setdiff(neg_eqtl_de_gns, pos_eqtl_de_gns)
## [1] "CTAGE3P" "EFHC1" "UTS2" "L1TD1" "TBRG1"
## [6] "ATP5L" "CHRDL2" "FAM20A" "SLTM" "TUG1"
## [11] "SNRPA1" "ERP27" "IL31RA" "HCLS1" "RBM25"
## [16] "PRRG1" "PPP6R3" "RUNDC3B" "ENTPD4" "ZFC3H1"
## [21] "DPP10" "CDK12" "MED4" "MYSM1" "HOXA1"
## [26] "R3HDM2" "SECISBP2" "EZH1" "HINT1" "NEK1"
## [31] "OR1C1" "ZMYM2" "RB1CC1" "RSRC2" "KIAA1468"
## [36] "GSN" "ZBTB1" "SLC26A5" "TBC1D15" "SRSF5"
## [41] "RPS15A" "CNTRL" "DUSP19" "MDM4" "LUC7L3"
## [46] "PDE3B" "FN1" "NAPG" "IFNAR2" "ILF3-AS1"
## [51] "POLI" "PDCD4" "MGEA5" "ZFP1" "NSD3"
## [56] "PNISR" "TRA2A" "SERPINI2" "STN1" "TIGD7"
## [61] "PAN3" "ESM1" "GGNBP2" "XPOT" "NEMF"
## [66] "AADACL2" "HIST1H2BE" "LYAR" "ACAT2" "NAP1L2"
## [71] "NKTR" "RFC1" "TTC17"
length(setdiff(neg_eqtl_de_gns, pos_eqtl_de_gns))
## [1] 73
intersect(pos_eqtl_de_gns, neg_eqtl_de_gns)
## [1] "ADGRG7"
So all differentially expressed genes in each group are unique to that group other than ADGRG7.
| AIBL SNPs | Proxy SNPs | |
|---|---|---|
| Total genotyped | 2084 | |
| No. in LD | 1,527 | 18,118 |
| Loci identified | 2075 | 17,847 |
| Genes | 621 | 1,103 |
| Unique genes (shared genes) | 127 (494) | 609 (494) |
| No. significant SNPs (p < 0.05) | Adjusted for multiple testing | No. genes mapped to the significant SNPs | |
|---|---|---|---|
| Association (chi square, Fisher exact) | 72 | 0 (from 10,000 permutations) | 58 |
| Logistic regression with covariates | 83 | 0 (< 0.05 FDR-adjusted) | 59 |
| Intersection between above analyses | 43 | N/A | 47 |
| No. significant SNPs (p < 0.05 FDR-adjusted) | No. differentially expressed genes | No. of SNPs acting in cis | |
|---|---|---|---|
| Total cohort | 130 SNPs mapped to 101 genes | 2246 | 8 |
| AB-pos only | 28 SNPs mapped to 31 genes | 48 | 0 |
| AB-neg only | 35 SNPs mapped to 34 genes | 74 | 0 |
All differentially expressed genes in the AB-pos and -neg groups were unique to that group other than ADGRG7.
Load data
require(limma)
load("~/Dropbox/eQTL/Results/cluster_data/gene_clusters_expression.RData")
Get END eigenvalues from ‘cluster_genexp’
require(limma)
load("~/Dropbox/eQTL/Results/cluster_data/gene_clusters_expression.RData")
ll <- lapply(cluster_genexp, function (x) {
xm <- as.matrix(log(x[,-1])) # creates a matrix, using log to standardise
svd.decomp2 <- svd(xm) # applies singular value decomposition
svd.v <- svd.decomp2$v # pulls out the matrix whose columns contain the right singular vectors of xm
X.pca <- xm %*% svd.v # multiplies the original 'xm' matrix by the right singular vectors matrix
xout <- X.pca[,1] # takes the first eigemvalue for each individual
return(xout)
})
Now each of the elements in the ‘ll’ list is a vector of 218 eigenvalues (one for each individual). Each element corresponds to one of the 244 networks.
Turn it into a data frame
lld <- do.call(rbind.data.frame,ll) # creates a data frame of eigenvalues
lld2 <- t(lld) # transpose so that individuals are rows
rownames(lld2) <- cluster_genexp[[1]][,1]
colnames(lld2) <- names(cluster_genexp)
Add metadata and gene/cluster annotation info
genemap <- read.csv("~/Dropbox/eQTL/Results/cluster_data/gene_clusters_hsa_map.csv") # note that 5 networks are missing from 'cluster_genexp' - why?
keymeta <- read.table("~/Desktop/Uni Air/CSIRO/AD_project/Working/key_metadata.txt", header = T)
kp <- ifelse(is.na(keymeta$PET)==T,FALSE,TRUE) # create vector of TRUE/FALSE for recorded PET status, to determine which ones are missing
keymeta1 <- keymeta[kp,] # restrict metadata for just those with recorded PET status
keymeta1 <- keymeta1[-c(grep(c("771|914|918"), keymeta1$AIBL.Id)), ] # remove outliers idenfitied with PCA on expression data
# number of unique networks
length(unique(genemap$hsa))
## [1] 249
# nubmer of unique root probe IDs
length(unique(genemap$root_gene_probe_id))
## [1] 104
lld3 <- lld2[as.character(keymeta$AIBL.Id),] # put eigenvalue data in same order as keymeta
kp <- ifelse(is.na(keymeta$PET)==T,FALSE,TRUE) # create vector of TRUE/FALSE for recorded PET status, to determine which ones are missing
keymeta1 <- keymeta[kp,] # restrict metadata for just those with recorded PET status
keymeta1 <- keymeta1[-c(grep(c("771|914|918"), keymeta1$AIBL.Id)), ] # remove outliers idenfitied with PCA on expression data
Y1 <- factor(as.character(keymeta1$PET)) # create factor vector, where NEG = 1, POS = 2
lld4 <- lld3[kp,] # remove individuals that do not have PET status recorded
lld4 <- lld4[-c(grep(c("771|914|918"), rownames(lld4))), ] # remove outliers identified with PCA on expression data
treat1<-factor(c(Y1), labels=c("Normal","Alzheimers")) # create vector of factors
design_treat1<-model.matrix(~0+treat1) # create model matrix
colnames(design_treat1) <- c("Normal","Alzheimers")
contrast.matrix_treat1<-makeContrasts(Normal-Alzheimers, levels=design_treat1) # create contrast
Visualise eigenvalues to see if they separate the groups
require(tidyverse)
lld4.1 <- data.frame(lld4)
lld4.1$AIBL.Id <- as.integer(rownames(lld4.1))
vis <- left_join(lld4.1, keymeta1, by = "AIBL.Id")
vis <- gather(vis, key = "network", value = "eigenvalue", 1:244)
ggplot(vis, aes(network, eigenvalue, colour = PET)) +
geom_jitter() # most networks seeem to have a pretty even distribution between POS and NEG groups
Run limma
# Transpose data into matrix for limma fit
x1 <- t(lld4)
fit.lm1<-lmFit(x1,design_treat1) # fit a linear model using the matrix x1 from above
fit1.c <-contrasts.fit(fit.lm1,contrast.matrix_treat1) # use contrasts made above
fit.eb1<- eBayes(fit1.c) # compute Bayes statistics
## comparison treated - untreated
topALL1<-toptable(fit=fit1.c, eb=fit.eb1, adjust="fdr", n=244, genelist=colnames(lld4))
topALL.1 <- topALL1[,c("ID","P.Value", "adj.P.Val")]
Many of the genes in the cluster_genexp data set weren’t differentially expressed in the inital limma. Let’s filter these and then run the END on that reduced set.
Create vector of unique significant genes in ‘cluster_genexp’ and annotate with p-values from original limma
# create vector of all probe IDs that sit within clusters
clst_gns <- lapply(cluster_genexp, function (x) {
names(x)[-1]
})
# take only unique probe IDs and remove "X" prefix
clst_gns_1 <- unique(as.character(unlist(clst_gns)))
clst_gns_1 <- gsub("X", "", clst_gns_1)
length(clst_gns_1)
## [1] 3434
# create data frame of probe IDs that sit within clusters, with their associated outputs from the original limma
lim1_out <- results_PETposvneg[results_PETposvneg$probe_id %in% clst_gns_1, ]
nrow(lim1_out)
## [1] 3434
So 3434 unique probe IDs sit within the clusters.
Get p-values from original limma for all root genes
root_lim <- results_PETposvneg[results_PETposvneg$probe_id %in% genemap$root_gene_probe_id, ] # take the expression data for root genes
root_lim <- dplyr::select(root_lim, root_probe_id = "probe_id", root_de_p.value = "P.Value", root_de_p.value_adj = "adj.P.Val") # select and rename columns of interest
# convert root_gene_probe_id to character for merging
genemap$root_gene_probe_id <- as.character(genemap$root_gene_probe_id)
# create consolidated table with initial limma results ("root_de_p.value"), cluster info, and END cluster limma results ("end_cluster_p.value")
root_lim <- dplyr::left_join(root_lim,
dplyr::select(genemap,
root_probe_id = "root_gene_probe_id",
root_gene_symbol,
cluster_id = "hsa",
cluster_name = "name"),
by = "root_probe_id") %>%
left_join(dplyr::select(topALL1,
cluster_id = "ID",
end_cluster_p.value = "P.Value",
end_cluster_p.value_adj = "adj.P.Val"),
by = "cluster_id")
## Warning: Column `cluster_id` joining factor and character vector, coercing
## into character vector
# for how many networks does the END limma have a lower p-value than the limma on the root gene?
length(which(root_lim$end_cluster_p.value < root_lim$root_de_p.value))
## [1] 24
So for 24 networks, the p-value from the limma on the END was lower than the p-value for the differential expression of the network’s root gene alone.
lim1_out_sig <- dplyr::filter(lim1_out, P.Value < 0.05) # filter for p-values < 0.05
sig_lim <- unlist(lim1_out_sig$probe_id) # create vector of significant probes
length(sig_lim)
## [1] 168
# remove the "X" from before all the probe names in the 'cluster_genexp' list
clst_tidy <- lapply(cluster_genexp, function (x) {
a <- c(names(x)[1], gsub("X", "", names(x)[-1]))
names(x) <- a
return(x)
})
# shows how many genes in each network were found to be differentially expressed in the initial limma
clst_tidy_1 <- lapply(clst_tidy, function(x) {
length(which(names(x) %in% sig_lim) == T)
})
# take all clusters with more than one gene that was found to be differentially expressed in the initial limma
clst_tidy_2 <- lapply(clst_tidy, function(x) {
if (length(which(names(x) %in% sig_lim) == T) >= 1){
a <- which(names(x) %in% sig_lim)
x[, c(1, a)]
}
})
# remove NULL entries
clst_tidy_3 <- clst_tidy_2[-which(sapply(clst_tidy_2, is.null))]
# how many clusters?
length(clst_tidy_3)
## [1] 199
# how many unique genes in those clusters?
clst_de_gns <- lapply(clst_tidy_3, function (x) {
names(x)[-1]
})
clst_de_gns_1 <- unique(as.character(unlist(clst_de_gns))) # take only unique probe IDs and remove "X" prefix
length(clst_de_gns_1)
## [1] 168
Before running END on this new filtered set, we’ll extract a few of the clusters with the most differentially expressed genes in them to see if the PET-pos and -neg groups separate with PCA.
tail(sort(sapply(clst_tidy_2, length)), 6) # get names of six clusters with the most DE genes
## hsa04151 hsa04510 hsa05225 hsa05165 hsa05205 hsa05200
## 10 10 10 12 12 25
t <- attr(tail(sort(sapply(clst_tidy_2, length)), 6), "names") # create vector of cluster names
# create list of just those six
clst_test <- clst_tidy_3[t]
# run loop over list to create plots for each
cluster_plots <- lapply(names(clst_test), function(namex) {
df <- data.frame(clst_test[[namex]], stringsAsFactors = F)
colnames(df)[1] <- "X"
df1 <- left_join(keymeta1, df, by = c("AIBL.Id" = "X"))
pca1 <- prcomp(df1[6:ncol(df1)], center = T, scale. = T)
library(ggfortify)
plot <- autoplot(pca1, data = df1, colour = "PET", label = T, label.label = "AIBL.Id") +
viridis::scale_colour_viridis(option = "viridis", discrete = T) +
ggtitle(namex)
print(plot)
})
There’s some separation, but not a lot. What if we combine them? That is, we take all the differentially expressed genes that also fell within networks.
# get expression data for the 168 DE genes that fell within the networks
test2 <- cbind(exdata3[, 1:5], exdata3[, sig_lim])
pca2 <- prcomp(test2[6:ncol(test2)], center = T, scale. = T)
autoplot(pca2, data = test2, colour = "PET", label = T, label.label = "AIBL.Id") +
viridis::scale_colour_viridis(option = "viridis", discrete = T)
Seems to be a bit of a dead end.
ll2 <- lapply(clst_tidy_3, function (x) {
xm <- as.matrix(log(x[,-1])) # creates a matrix, using log to standardise
svd.decomp2 <- svd(xm) # applies singular value decomposition
svd.v <- svd.decomp2$v # pulls out the matrix whose columns contain the right singular vectors of xm
X.pca <- xm %*% svd.v # multiplies the original 'xm' matrix by the right singular vectors matrix
xout <- X.pca[,1] # takes the first eigemvalue for each individual
return(xout)
})
Turn it into a data frame
ll2d <- do.call(rbind.data.frame,ll2) # creates a data frame of eigenvalues
ll2d2 <- t(ll2d) # transpose so that individuals are rows
rownames(ll2d2) <- clst_tidy_3[[1]][,1]
colnames(ll2d2) <- names(clst_tidy_3)
ll2d3 <- ll2d2[as.character(keymeta$AIBL.Id),] # put eigenvalue data in same order as keymeta
kp <- ifelse(is.na(keymeta$PET)==T,FALSE,TRUE) # create vector of TRUE/FALSE for recorded PET status, to determine which ones are missing
keymeta1 <- keymeta[kp,] # restrict metadata for just those with recorded PET status
keymeta1 <- keymeta1[-c(grep(c("771|914|918"), keymeta1$AIBL.Id)), ] # remove outliers idenfitied with PCA on expression data
Y1 <- factor(as.character(keymeta1$PET)) # create factor vector, where NEG = 1, POS = 2
ll2d4 <- ll2d3[kp,] # remove individuals that do not have PET status recorded
ll2d4 <- ll2d4[-c(grep(c("771|914|918"), rownames(ll2d4))), ] # remove outliers identified with PCA on expression data
treat1<-factor(c(Y1), labels=c("Normal","Alzheimers")) # create vector of factors
design_treat1<-model.matrix(~0+treat1) # create model matrix
colnames(design_treat1) <- c("Normal","Alzheimers")
contrast.matrix_treat1<-makeContrasts(Normal-Alzheimers, levels=design_treat1) # create contrast
Visualise eigenvalues to see if they separate the groups
require(tidyverse)
ll2d4.1 <- data.frame(ll2d4)
ll2d4.1$AIBL.Id <- as.integer(rownames(ll2d4.1))
vis2 <- left_join(ll2d4.1, keymeta1, by = "AIBL.Id")
vis2 <- gather(vis2, key = "network", value = "eigenvalue", 1:(ncol(vis2) - 5))
ggplot(vis2, aes(network, eigenvalue, colour = PET)) +
geom_jitter() # most networks seeem to have a pretty even distribution between POS and NEG groups
Run limma
# Transpose data into matrix for limma fit
x1 <- t(ll2d4)
fit.lm1<-lmFit(x1,design_treat1) # fit a linear model using the matrix x1 from above
fit1.c <-contrasts.fit(fit.lm1,contrast.matrix_treat1) # use contrasts made above
fit.eb1<- eBayes(fit1.c) # compute Bayes statistics
## comparison treated - untreated
topALL2<-toptable(fit=fit1.c, eb=fit.eb1, adjust="fdr", n=244, genelist=colnames(ll2d4))
topALL.2 <- topALL2[,c("ID","P.Value", "adj.P.Val")]
# number of significantly differentiated networks
length(which(topALL.2$P.Value < 0.05))
## [1] 129
# number of significantly differentiated networks (FDR-adjusted)
length(which(topALL.2$adj.P.Val < 0.05))
## [1] 88
Join results to ‘root_lim’ table
root_lim <- dplyr::left_join(root_lim,
dplyr::select(topALL.2,
cluster_id = "ID",
end_cluster_de_p.value = "P.Value",
end_cluster_de_p.value_adj = "adj.P.Val"),
by = "cluster_id")
# create data frame with attached metadata
fdr_sig <- topALL.2$ID[topALL.2$adj.P.Val < 0.05]# create vector of significant network IDs
ll2d4.2 <- ll2d4.1[, colnames(ll2d4.1) %in% fdr_sig] # restrict to just those networks that are significant at the FDR-adjusted level
ll2d4.2$AIBL.Id <- as.integer(rownames(ll2d4.2))
ll2d4.2 <- left_join(ll2d4.2, keymeta1, by = "AIBL.Id")
ll2d4.2 <- dplyr::select(ll2d4.2, AIBL.Id, Demographic.Sex, PET, apoe4, Age, everything())
# carry out PCA
pca3 <- prcomp(ll2d4.2[6:ncol(ll2d4.2)], center = T, scale. = T)
autoplot(pca3, data = ll2d4.2, colour = "PET", label = T, label.label = "AIBL.Id") +
viridis::scale_colour_viridis(option = "viridis", discrete = T) +
ggtitle("PCA with significant clusters (FDR-adjusted)")
How about if we just take the top 10 clusters?
# create data frame with attached metadata
fdr_sig_10 <- topALL.2$ID[1:10]# create vector of significant network IDs
ll2d4.3 <- ll2d4.1[, colnames(ll2d4.1) %in% fdr_sig_10] # restrict to just those networks that are significant at the FDR-adjusted level
ll2d4.3$AIBL.Id <- as.integer(rownames(ll2d4.3))
ll2d4.3 <- left_join(ll2d4.3, keymeta1, by = "AIBL.Id")
ll2d4.3 <- dplyr::select(ll2d4.3, AIBL.Id, Demographic.Sex, PET, apoe4, Age, everything())
# carry out PCA
pca3 <- prcomp(ll2d4.3[6:ncol(ll2d4.3)], center = T, scale. = T)
autoplot(pca3, data = ll2d4.3, colour = "PET", label = T, label.label = "AIBL.Id") +
viridis::scale_colour_viridis(option = "viridis", discrete = T) +
ggtitle("PCA with top 10 significant clusters (FDR-adjusted)")
How about if we just take the top 5 clusters?
# create data frame with attached metadata
fdr_sig_5 <- topALL.2$ID[1:5]# create vector of significant network IDs
ll2d4.4 <- ll2d4.1[, colnames(ll2d4.1) %in% fdr_sig_5] # restrict to just those networks that are significant at the FDR-adjusted level
ll2d4.4$AIBL.Id <- as.integer(rownames(ll2d4.4))
ll2d4.4 <- left_join(ll2d4.4, keymeta1, by = "AIBL.Id")
ll2d4.4 <- dplyr::select(ll2d4.4, AIBL.Id, Demographic.Sex, PET, apoe4, Age, everything())
# carry out PCA
pca4 <- prcomp(ll2d4.4[6:ncol(ll2d4.4)], center = T, scale. = T)
autoplot(pca4, data = ll2d4.4, colour = "PET", label = T, label.label = "AIBL.Id") +
viridis::scale_colour_viridis(option = "viridis", discrete = T) +
ggtitle("PCA with top 5 significant clusters (FDR-adjusted)")
How about if we just take the top 2 clusters?
# create data frame with attached metadata
fdr_sig_2 <- topALL.2$ID[1:2]# create vector of significant network IDs
ll2d4.5 <- ll2d4.1[, colnames(ll2d4.1) %in% fdr_sig_2] # restrict to just those networks that are significant at the FDR-adjusted level
ll2d4.5$AIBL.Id <- as.integer(rownames(ll2d4.5))
ll2d4.5 <- left_join(ll2d4.5, keymeta1, by = "AIBL.Id")
ll2d4.5 <- dplyr::select(ll2d4.5, AIBL.Id, Demographic.Sex, PET, apoe4, Age, everything())
# carry out PCA
pca5 <- prcomp(ll2d4.5[6:ncol(ll2d4.5)], center = T, scale. = T)
autoplot(pca5, data = ll2d4.5, colour = "PET", label = T, label.label = "AIBL.Id") +
viridis::scale_colour_viridis(option = "viridis", discrete = T) +
ggtitle("PCA with top 2 significant clusters (FDR-adjusted)")
How about with all clusters at the nominal p-value?
# create data frame with attached metadata
fdr_sig_all <- topALL.2$ID[topALL.2$P.Value < 0.05]# create vector of significant network IDs
ll2d4.6 <- ll2d4.1[, colnames(ll2d4.1) %in% fdr_sig_all] # restrict to just those networks that are significant at the FDR-adjusted level
ll2d4.6$AIBL.Id <- as.integer(rownames(ll2d4.6))
ll2d4.6 <- left_join(ll2d4.6, keymeta1, by = "AIBL.Id")
ll2d4.6 <- dplyr::select(ll2d4.6, AIBL.Id, Demographic.Sex, PET, apoe4, Age, everything())
# carry out PCA
pca6 <- prcomp(ll2d4.6[6:ncol(ll2d4.6)], center = T, scale. = T)
autoplot(pca6, data = ll2d4.6, colour = "PET", label = T, label.label = "AIBL.Id") +
viridis::scale_colour_viridis(option = "viridis", discrete = T) +
ggtitle("PCA with all significant clusters (nominal p-value)")
Find which of the significant DE genes from the initial limma were also significant from the eQTL analysis of the PET-pos group
which(lim1_out_sig$probe_id %in% pos_sig_eqtl_gns$gene)
## integer(0)
None!
How about the eQTL analysis from the group as a whole?
which(lim1_out_sig$probe_id %in% sig_eqtl_gns$gene)
## [1] 29 53 70 80 82 85 91 104 105 112 130 131 135 141 145 166
# get probe IDs for the DE eQTL genes
eqtl_sig_probes <- lim1_out_sig$probe_id[lim1_out_sig$probe_id %in% sig_eqtl_gns$gene]
# create list of cluster IDs that contain genes that were DE in the eQTL analysis
test <- lapply(clst_tidy, function(x){
if (length(which(names(x) %in% eqtl_sig_probes) >= 1)){
return(x)
}
})
test <- test[-which(sapply(test, is.null))] # remove NULL entries
# create vector of cluster IDs
eqtl_cluster_ids <- names(test)
ll2d4.7 <- ll2d4.1[, colnames(ll2d4.1) %in% eqtl_cluster_ids] # restrict to just those networks that hold genes that were DE in the eQTL analysis
ll2d4.7$AIBL.Id <- as.integer(rownames(ll2d4.7))
ll2d4.7 <- left_join(ll2d4.7, keymeta1, by = "AIBL.Id")
ll2d4.7 <- dplyr::select(ll2d4.7, AIBL.Id, Demographic.Sex, PET, apoe4, Age, everything())
# carry out PCA
pca7 <- prcomp(ll2d4.7[6:ncol(ll2d4.7)], center = T, scale. = T)
autoplot(pca7, data = ll2d4.7, colour = "PET", label = T, label.label = "AIBL.Id") +
viridis::scale_colour_viridis(option = "viridis", discrete = T) +
ggtitle("PCA differentially expressed eQTL genes")
load(url("https://github.com/IanBrettell/AD_project/raw/master/Working/networks/dingo_filtered_probes.RData"))
load(url("https://github.com/IanBrettell/AD_project/raw/master/Working/networks/filtered_clusters.RData"))
# how many unique probes are there in the filtered iDINGO output?
length(dingo_filtered_probes)
## [1] 3434
9 February 2018
| No. unique genes (probes) sig. at p < 0.05 (annotated) | No. genes (probes) sig. FDR-adjusted level (annotated) | No. networks | |
|---|---|---|---|
| DE in limma (1) | 865 (533) | 0 | 244 |
| Sig. diff. correlation from iDINGO | 3434 (3434) | ? | 249 |
| No. sig. genes from (1) in networks | 168 (168) | NA | 199 |
For Monday:
Increase the threshold for the number of DE genes in each cluster from 1 to say, 5, create eigenvalues, for those, and then see how much better the PCA gets. Or just take the top 10 clusters.
Find which differentially expressed eQTL genes are within the FDR-adjusted significant networks of the limma run on the above.
Create boxplots for those genes by genotype, APOE-e4, and PET status.
13 February 2018
end_eigen <- function(cluster_list){
# create eigenvalues
ll <- lapply(cluster_list, function (x) {
xm <- as.matrix(log(x[,-1])) # creates a matrix, using log to standardise
svd.decomp2 <- svd(xm) # applies singular value decomposition
svd.v <- svd.decomp2$v # pulls out the matrix whose columns contain the right singular vectors of xm
X.pca <- xm %*% svd.v # multiplies the original 'xm' matrix by the right singular vectors matrix
xout <- X.pca[,1] # takes the first eigemvalue for each individual
return(xout)
})
# turn it into a data frame
lld <- do.call(rbind.data.frame,ll) # creates a data frame of eigenvalues
lld2 <- t(lld) # transpose so that individuals are rows
rownames(lld2) <- cluster_list[[1]][,1]
colnames(lld2) <- names(cluster_list)
# filter data and create contrasts
require(RCurl)
keymeta <- read.delim(text = getURL("https://raw.githubusercontent.com/IanBrettell/AD_project/master/Working/key_metadata.txt"), header = T) # read key metadata
lld3 <- lld2[as.character(keymeta$AIBL.Id),] # put eigenvalue data in same order as keymeta
kp <- ifelse(is.na(keymeta$PET)==T,FALSE,TRUE) # create vector of TRUE/FALSE for recorded PET status, to determine which ones are missing
keymeta1 <- keymeta[kp,] # restrict metadata for just those with recorded PET status
keymeta1 <- keymeta1[-c(grep(c("771|914|918"), keymeta1$AIBL.Id)), ] # remove outliers idenfitied with PCA on expression data
Y1 <- factor(as.character(keymeta1$PET)) # create factor vector, where NEG = 1, POS = 2
lld4 <- lld3[kp,] # remove individuals that do not have PET status recorded
lld4 <- lld4[-c(grep(c("771|914|918"), rownames(lld4))), ] # remove outliers identified with PCA on expression data
treat1<-factor(c(Y1), labels=c("Normal","Alzheimers")) # create vector of factors
design_treat1<-model.matrix(~0+treat1) # create model matrix
colnames(design_treat1) <- c("Normal","Alzheimers")
contrast.matrix_treat1<-makeContrasts(Normal-Alzheimers, levels=design_treat1) # create contrast
# run limma
x1 <- t(lld4) # Transpose data into matrix for limma fit
fit.lm1<-lmFit(x1,design_treat1) # fit a linear model using the matrix x1 from above
fit1.c <-contrasts.fit(fit.lm1,contrast.matrix_treat1) # use contrasts made above
fit.eb1<- eBayes(fit1.c) # compute Bayes statistics
topALL1<-toptable(fit=fit1.c, eb=fit.eb1, adjust="fdr", n=length(cluster_list), genelist=colnames(lld4))## comparison treated - untreated
}
# create function for PCA analysis
end_pca_plot <- function(cluster_list, title){
# create eigenvalues
ll <- lapply(cluster_list, function (x) {
xm <- as.matrix(log(x[,-1])) # creates a matrix, using log to standardise
svd.decomp2 <- svd(xm) # applies singular value decomposition
svd.v <- svd.decomp2$v # pulls out the matrix whose columns contain the right singular vectors of xm
X.pca <- xm %*% svd.v # multiplies the original 'xm' matrix by the right singular vectors matrix
xout <- X.pca[,1] # takes the first eigemvalue for each individual
return(xout)
})
# turn it into a data frame
lld <- do.call(rbind.data.frame,ll) # creates a data frame of eigenvalues
lld2 <- t(lld) # transpose so that individuals are rows
rownames(lld2) <- cluster_list[[1]][,1]
colnames(lld2) <- names(cluster_list)
# filter data and create contrasts
keymeta <- read.table("~/Desktop/Uni Air/CSIRO/AD_project/Working/key_metadata.txt", header = T) # read key metadata
lld3 <- lld2[as.character(keymeta$AIBL.Id),] # put eigenvalue data in same order as keymeta
kp <- ifelse(is.na(keymeta$PET)==T,FALSE,TRUE) # create vector of TRUE/FALSE for recorded PET status, to determine which ones are missing
keymeta1 <- keymeta[kp,] # restrict metadata for just those with recorded PET status
keymeta1 <- keymeta1[-c(grep(c("771|914|918"), keymeta1$AIBL.Id)), ] # remove outliers idenfitied with PCA on expression data
Y1 <- factor(as.character(keymeta1$PET)) # create factor vector, where NEG = 1, POS = 2
lld4 <- lld3[kp,] # remove individuals that do not have PET status recorded
lld4 <- lld4[-c(grep(c("771|914|918"), rownames(lld4))), ] # remove outliers identified with PCA on expression data
treat1<-factor(c(Y1), labels=c("Normal","Alzheimers")) # create vector of factors
design_treat1<-model.matrix(~0+treat1) # create model matrix
colnames(design_treat1) <- c("Normal","Alzheimers")
contrast.matrix_treat1<-makeContrasts(Normal-Alzheimers, levels=design_treat1) # create contrast
# run limma
x1 <- t(lld4) # Transpose data into matrix for limma fit
fit.lm1<-lmFit(x1,design_treat1) # fit a linear model using the matrix x1 from above
fit1.c <-contrasts.fit(fit.lm1,contrast.matrix_treat1) # use contrasts made above
fit.eb1<- eBayes(fit1.c) # compute Bayes statistics
topALL1<-toptable(fit=fit1.c, eb=fit.eb1, adjust="fdr", n=length(cluster_list), genelist=colnames(lld4))## comparison treated - untreated
# create data frame with attached metadata
if (length(topALL1$ID[topALL1$adj.P.Val < 0.05]) > 0){
fdr_sig <- topALL1$ID[topALL1$adj.P.Val < 0.05]# create vector of significant network IDs
} else {
fdr_sig <- topALL1$ID[topALL1$P.Value < 0.05]
}
lld4.1 <- data.frame(lld4)
lld4.2 <- lld4.1[, colnames(lld4.1) %in% fdr_sig] # restrict to just those networks that are significant at the FDR-adjusted level
lld4.2$AIBL.Id <- as.integer(rownames(lld4.2))
require(dplyr)
lld4.2 <- left_join(lld4.2, keymeta1, by = "AIBL.Id")
lld4.2 <- dplyr::select(lld4.2, AIBL.Id, Demographic.Sex, PET, apoe4, Age, everything())
# run PCA
pca <- prcomp(lld4.2[6:ncol(lld4.2)], center = T, scale. = T)
# plot
require(ggplot2)
require(ggfortify)
plot <- autoplot(pca, data = lld4.2, colour = "PET", label = T, label.label = "AIBL.Id") +
viridis::scale_colour_viridis(option = "viridis", discrete = T) +
ggtitle(title)
return(plot)
}
tail(sort(sapply(clst_tidy_2, length)), 10) # get names of six clusters with the most DE genes
## hsa05169 hsa04360 hsa04740 hsa05206 hsa04151 hsa04510 hsa05225 hsa05165
## 8 9 9 9 10 10 10 12
## hsa05205 hsa05200
## 12 25
t <- attr(tail(sort(sapply(clst_tidy_2, length)), 10), "names") # create vector of cluster names
# create list of just those six
clst_top_10 <- clst_tidy_3[t]
Run function on ‘clst_top_10’
end_pca_plot(clst_top_10, "PCA using 10 clusters with the most DE genes")
tail(sort(sapply(clst_tidy_2, length)), 5) # get names of six clusters with the most DE genes
## hsa04510 hsa05225 hsa05165 hsa05205 hsa05200
## 10 10 12 12 25
t <- attr(tail(sort(sapply(clst_tidy_2, length)), 5), "names") # create vector of cluster names
# create list of just those six
clst_top_5 <- clst_tidy_3[t]
Run function on ‘clst_top_5’
end_pca_plot(clst_top_5, "PCA using 5 clusters with the most DE genes")
Meeting with JD:
Take average of absolute z-scores. Take 5th percentile from either end. Get mean, median, SD -This will let us know which clusters separate the two groups the best Dingo -Take the average of the bootstrap replications (the difference). The better that does, the smaller the error around the bootstrap average.
Load files
# load files from dropbox
load("~/Dropbox/eQTL/Results/cluster_data/iDINGO_results/RData/All_clusters.RData")
# create list
dingo_clstrs <- mget(ls(pattern = "hsa"))
# remove indidivual cluster lists
rm(list = ls(pattern = "hsa"))
Get the mean of each gene pair’s bootstrap
dingo_clstrs_2 <- lapply(dingo_clstrs, function(x){
apply(x[["bootdiff"]], 1, mean)
})
Find the indexes for the gene pairs that sit in the extreme tails (<2.5%, >97.5%)
dingo_clstrs_3 <- lapply(dingo_clstrs_2, function(x){
which(x < quantile(x, 0.025) | x > quantile(x, 0.975))
})
Get gene pairs corresponding to indexes
# create function to extract gene pairs
ind_match <- function(x, y){
list(x[["genepairs"]][y, ])
}
# run over both lists
dingo_clstrs_4 <- mapply(ind_match, dingo_clstrs, dingo_clstrs_3)
Get unique genes in each network
dingo_clstrs_5 <- lapply(dingo_clstrs_4, function(x){
a <- unique(x[["gene1"]])
b <- unique(x[["gene2"]])
union(a, b)
})
Pull out expression data for those genes
# find which clusters in 'dingo_clstrs_5 aren't in 'cluster_genexp' and remove
dingo_clstrs_6 <- dingo_clstrs_5[which(names(dingo_clstrs_5) %in% names(cluster_genexp))]
# test
# names(cluster_genexp[[2]])[names(cluster_genexp[[2]]) %in% dingo_clstrs_5[[2]]]
# create function to find which genes in 'cluster_genexp' that are in 'dingo_clstrs_6'
gene_match <- function(x, y){
cbind(x[1], x[names(x) %in% y])
}
# extract expression data for genes in 'cluster_genexp' that are in 'dingo_clstrs_6'
dingo_clstrs_7 <- mapply(gene_match, cluster_genexp, dingo_clstrs_6)
end_pca_plot(dingo_clstrs_7, "PCA using END genes (5% threshold)")
Not great clustering. Try higher threshold?
Create function to END dingo analysis
end_filter <- function(list, lower, upper){
# get list of means for the bootstrapped difference
dingo_clstrs_2 <- lapply(list, function(x){
apply(x[["bootdiff"]], 1, mean)
})
# Find the indexes for the gene pairs that sit in the extreme tails
dingo_clstrs_3 <- lapply(dingo_clstrs_2, function(x){
which(x < quantile(x, upper) | x > quantile(x, lower))
})
# get gene pairs corresponding to indexes
ind_match <- function(x, y){ # create function to extract gene pairs
list(x[["genepairs"]][y, ])
}
dingo_clstrs_4 <- mapply(ind_match, dingo_clstrs, dingo_clstrs_3) # run over both lists
# get unique genes in each network
dingo_clstrs_5 <- lapply(dingo_clstrs_4, function(x){
a <- unique(x[["gene1"]])
b <- unique(x[["gene2"]])
union(a, b)
})
# pull out expression data for those genes
dingo_clstrs_6 <- dingo_clstrs_5[which(names(dingo_clstrs_5) %in% names(cluster_genexp))] # find which clusters in 'dingo_clstrs_5 aren't in 'cluster_genexp' and remove
gene_match <- function(x, y){ # create function to find which genes in 'cluster_genexp' that are in 'dingo_clstrs_6'
cbind(x[1], x[names(x) %in% y])
}
dingo_clstrs_7 <- mapply(gene_match, cluster_genexp, dingo_clstrs_6) # extract expression data for genes in 'cluster_genexp' that are in 'dingo_clstrs_6'
# remove clusters with 0 genes
if (any(sapply(dingo_clstrs_7, function(x) length(x) < 2))){
dingo_clstrs_8 <- lapply(dingo_clstrs_7, function(x) { # create NULL entries
if (length(x) < 2){
x <- NULL
} else {
x
}
})
dingo_clstrs_8 <- dingo_clstrs_8[-which(sapply(dingo_clstrs_8, is.null))]
dingo_clstrs_8
} else {
dingo_clstrs_7
}
}
Run with 1% on either side
one_percentile <- end_filter(dingo_clstrs, 0.01, 0.99)
end_pca_plot(one_percentile, "PCA using END genes (1% threshold)")
half_percentile <- end_filter(dingo_clstrs, 0.005, 0.995)
end_pca_plot(half_percentile, "PCA using END genes (0.5% threshold)")
quarter_percentile <- end_filter(dingo_clstrs, 0.0025, 0.9975)
end_pca_plot(quarter_percentile, "PCA using END genes (0.25% threshold)")
point1_percentile <- end_filter(dingo_clstrs, 0.001, 0.999)
end_pca_plot(point1_percentile, "PCA using END genes (0.1% threshold)")
19 February 2018
Meeting with JD and MV
*Create a table of p-values: - one column for the intial limma results for the root gene of the network - one column for the limma results from the END for the networks containing just the genes that were significantly DE in the initial limma - one column for the limma results from the END for the networks containing just the genes that were filtered using the iDINGO results
20 February 2018
# load table of original limma results
sig_PET_pos_v_neg <- read.table(text = getURL("https://raw.githubusercontent.com/IanBrettell/AD_project/master/Working/sig_PET_pos_v_neg.txt"),
header = T)
#select and rename relevant columns
init_lim_table <- sig_PET_pos_v_neg[!is.na(sig_PET_pos_v_neg$gene_symbol), ] %>%
dplyr::select(probe_id, gene_symbol, logFC_init_limma = "logFC", p_value_init_limma = "P.Value", FDR_init_limma = "adj.P.Val")
# convert to character to later merging
init_lim_table$probe_id <- as.character(init_lim_table$probe_id)
# read table with gene-cluster information
gns_in_clstrs <- read.table("~/Dropbox/eQTL/Results/cluster_data/gene_clusters_PC1.csv", sep = ",", header = T)
gns_in_clstrs <- dplyr::rename(gns_in_clstrs, probe_id = "X")
# get names of all the clusters that each gene falls within
hsas <- apply(gns_in_clstrs, 1, function(x) {
colnames(gns_in_clstrs[which(!is.na(x))][-1])
})
names(hsas) <- gns_in_clstrs$probe_id
# turn 'hsas' list into a data frame column
hsas_df <- tibble::enframe(hsas)
# bind with initial limma results
p_table_1 <- left_join(init_lim_table,
dplyr::select(hsas_df, name, networks = "value"),
by = c("probe_id" = "name"))
How many of the initial DE genes fell within netorks?
length(which(sapply(p_table_1$networks, is.null) == F))
## [1] 168
So only 0 of the DE probes from the original limma fell within a network.
Bind with ‘genemap’
genemap$root_gene_probe_id <- as.character(genemap$root_gene_probe_id)
p_table_2 <- dplyr::left_join(p_table_1,
dplyr::select(genemap, root_gene_probe_id, rooted_network = "hsa", name),
by = c("probe_id" = "root_gene_probe_id"))
Get END results for clusters containing DE genes only
# get table of p-values
end_clsts_de_gns_only <- end_eigen(clst_tidy_3)
# join to table
p_table_3 <- dplyr::left_join(p_table_2,
dplyr::select(end_clsts_de_gns_only, ID, logFC_END_DE_only = "logFC", p_value_END_DE_only = "P.Value", FDR_END_DE_only = "adj.P.Val"),
by = c("rooted_network" = "ID"))
For 0.5% threshold
end_results_0.5 <- end_eigen(half_percentile)
# join to table
p_table_4 <- dplyr::left_join(p_table_3,
dplyr::select(end_results_0.5, ID, logFC_END_0.5 = "logFC", p_value_END_0.5 = "P.Value", FDR_END_0.5 = "adj.P.Val"),
by = c("rooted_network" = "ID"))
For 0.25% threshold
end_results_0.25 <- end_eigen(quarter_percentile)
# join to table
p_table_5 <- dplyr::left_join(p_table_4,
dplyr::select(end_results_0.25, ID, logFC_END_0.25 = "logFC", p_value_END_0.25 = "P.Value", FDR_END_0.25 = "adj.P.Val"),
by = c("rooted_network" = "ID"))
For 0.1% threshold
end_results_0.1 <- end_eigen(point1_percentile)
# join to table
p_table_6 <- dplyr::left_join(p_table_5,
dplyr::select(end_results_0.1, ID, logFC_END_0.1 = "logFC", p_value_END_0.1 = "P.Value", FDR_END_0.1 = "adj.P.Val"),
by = c("rooted_network" = "ID"))
For 1% threshold, and filter data for plotting
p_table_7 <- p_table_6[!is.na(p_table_6$rooted_network), ]
# add results for END 1 percentile
end_results_1 <- end_eigen(one_percentile)
p_table_7 <- dplyr::left_join(p_table_7,
dplyr::select(end_results_1, ID, logFC_END_1 = "logFC", p_value_END_1 = "P.Value", FDR_END_1 = "adj.P.Val"),
by = c("rooted_network" = "ID"))
vol_plot <- function(limma_output, xvar = "logFC_column", yvar = "p_value_column", alpha = 0.05, title = "Volcano plot", FDR = c(T, F), label_limit = alpha, label_type = c("gene", "network"), xlim_start = -0.15, xlim_end = 0.2){
# load required packages
require(tidyverse)
require(ggrepel)
# create data frame for manipulation
data <- limma_output
data$xcol <- data[, xvar]
data$ycol <- data[, yvar]
data$sig <- ifelse(data$ycol < alpha, "sig", "not sig") # create column for sign/non-sig
data$ytrans <- -log10(data$ycol) # log transform p-values
if (FDR == T){
lab <- expression(-log[10]("FDR"))
}
if (FDR == F){
lab <- expression(-log[10]("p-value"))
}
# run ggplot
p <- ggplot(data = data,
aes(x = xcol, y = ytrans),
environment = environment()) +
geom_point(aes(col = sig)) +
scale_colour_manual(values = c("sig" = "#F8766D", "not sig" = "#00BFC4")) +
geom_hline(yintercept = -log10(alpha)) +
xlim(xlim_start, xlim_end) +
ylim(0, 4) +
xlab(expression(log[2]("fold change"))) +
ylab(lab) +
ggtitle(paste(title, " (alpha = ", alpha, ")", sep = ""))
# create labels based on either gene or network
ifelse(label_type == "gene",
return(p + geom_text_repel(data = dplyr::filter(data, ycol < label_limit),
aes(label = gene_symbol))),
ifelse(label_type == "network",
return(p + geom_text_repel(data = dplyr::filter(data, ycol < label_limit),
aes(label = name))),
return(p)))
}
plot_END_1 <- vol_plot(p_table_7, xvar = "logFC_END_1", yvar = "FDR_END_1", title = "END with genes in 1 percentile", FDR = T)
## Loading required package: ggrepel
plot_END_0.5 <- vol_plot(p_table_7, xvar = "logFC_END_0.5", yvar = "FDR_END_0.5", title = "END with genes in 0.5 percentile", FDR = T)
plot_END_0.25 <- vol_plot(p_table_7, xvar = "logFC_END_0.25", yvar = "FDR_END_0.25", title = "END with genes in 0.25 percentile", FDR = T)
plot_END_0.1 <- vol_plot(p_table_7, xvar = "logFC_END_0.1", yvar = "FDR_END_0.1", title = "END with genes in 0.1 percentile", FDR = T)
plot_END_1
## Warning: Removed 4 rows containing missing values (geom_point).
plot_END_0.5
## Warning: Removed 4 rows containing missing values (geom_point).
plot_END_0.25
## Warning: Removed 4 rows containing missing values (geom_point).
plot_END_0.1
## Warning: Removed 4 rows containing missing values (geom_point).
vol_plot(p_table_7, xvar = "logFC_init_limma", yvar = "p_value_init_limma", title = "Initial limma: PET-pos v -neg", FDR = F, label_limit = 0.001)
## Warning: Removed 3 rows containing missing values (geom_point).
vol_plot(p_table_7, xvar = "logFC_END_0.5", yvar = "p_value_END_0.5", title = "END with genes in 0.5 percentile", FDR = F, label_limit = 0.001)
## Warning: Removed 4 rows containing missing values (geom_point).
vol_plot(p_table_7, xvar = "logFC_END_DE_only", yvar = "p_value_END_DE_only", title = "END with DE genes only", FDR = F, label_limit = 0.001, label_type = "gene")
## Warning: Removed 59 rows containing missing values (geom_point).
## Warning: Removed 10 rows containing missing values (geom_text_repel).
a <- vol_plot(p_table_7, xvar = "logFC_init_limma", yvar = "FDR_init_limma", title = "Initial limma: PET-pos v -neg", FDR = T)
b <- vol_plot(p_table_7, xvar = "logFC_END_0.5", yvar = "FDR_END_0.5", title = "END with genes in 0.5 percentile", FDR = T)
c <- vol_plot(p_table_7, xvar = "logFC_END_DE_only", yvar = "FDR_END_DE_only", title = "END with DE genes only", FDR = T, label_limit = 0.005, label_type = "gene")
# redo with network names instead
d <- vol_plot(p_table_7, xvar = "logFC_init_limma", yvar = "FDR_init_limma", title = "Initial limma: PET-pos v -neg", FDR = T)
e <- vol_plot(p_table_7, xvar = "logFC_END_0.5", yvar = "FDR_END_0.5", title = "END with genes in 0.5 percentile", FDR = T, label_type = "network")
f <- vol_plot(p_table_7, xvar = "logFC_END_DE_only", yvar = "FDR_END_DE_only", title = "END with DE genes only", FDR = T, label_limit = 0.005, label_type = "network")
a
## Warning: Removed 3 rows containing missing values (geom_point).
b
## Warning: Removed 4 rows containing missing values (geom_point).
c
## Warning: Removed 49 rows containing missing values (geom_point).
d
## Warning: Removed 3 rows containing missing values (geom_point).
e
## Warning: Removed 4 rows containing missing values (geom_point).
f
## Warning: Removed 49 rows containing missing values (geom_point).